Diffstat (limited to 'llvm/test/CodeGen')
228 files changed, 23860 insertions, 16157 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll
index 55cf48e..d1a6584a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-switch-split.ll
@@ -9,7 +9,7 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
 
 declare i32 @logg(...)
 
-define i32 @scanfile(i32 %call148) {
+define i32 @scanfile(i32 %call148, ptr %p) {
 ; CHECK-LABEL: scanfile:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
@@ -26,7 +26,7 @@ define i32 @scanfile(i32 %call148) {
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT: ret
 ; CHECK-NEXT: LBB0_3: ; %entry
-; CHECK-NEXT: b.eq LBB0_2
+; CHECK-NEXT: b.eq LBB0_10
 ; CHECK-NEXT: ; %bb.4: ; %entry
 ; CHECK-NEXT: cmp w8, #2
 ; CHECK-NEXT: b.eq LBB0_6
@@ -46,6 +46,10 @@ define i32 @scanfile(i32 %call148) {
 ; CHECK-NEXT: LBB0_9: ; %sw.bb150
 ; CHECK-NEXT: bl _logg
 ; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: LBB0_10: ; %sw.bb178
+; CHECK-NEXT: str wzr, [x1]
+; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-NEXT: ret
 entry:
   switch i32 %call148, label %common.ret [
     i32 -1, label %sw.bb
@@ -80,7 +84,7 @@
 sw.bb152: ; preds = %entry
   br label %common.ret
 
 sw.bb178: ; preds = %entry
-  call void @llvm.lifetime.start.p0(i64 0, ptr null)
+  store i32 0, ptr %p
   br label %common.ret
 }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
new file mode 100644
index 0000000..8552931
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
@@ -0,0 +1,109 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple aarch64 -passes="print<gisel-value-tracking>" %s -o - 2>&1 | FileCheck %s
+
+---
+name: Cst
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: @Cst
+    ; CHECK-NEXT: %0:_ KnownBits:10000000 SignBits:1
+    ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+    ; CHECK-NEXT: %2:_ KnownBits:11110000 SignBits:4
+    %0:_(s8) = G_CONSTANT i8 128
+    %1:_(s8) = G_CONSTANT i8 3
+    %2:_(s8) = G_ASHR %0, %1
+...
+---
+name: CstBig
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: @CstBig
+    ; CHECK-NEXT: %0:_ KnownBits:11111000 SignBits:5
+    ; CHECK-NEXT: %1:_ KnownBits:00000110 SignBits:5
+    ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8
+    %0:_(s8) = G_CONSTANT i8 248
+    %1:_(s8) = G_CONSTANT i8 6
+    %2:_(s8) = G_ASHR %0, %1
+...
+---
+name: ScalarVar
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: @ScalarVar
+    ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+    ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+    ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+    %0:_(s8) = COPY $b0
+    %1:_(s8) = COPY $b1
+    %2:_(s8) = G_ASHR %0, %1
+...
+---
+name: ScalarCst
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: @ScalarCst
+    ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+    ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+    ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:4
+    %0:_(s8) = COPY $b0
+    %1:_(s8) = G_CONSTANT i8 3
+    %2:_(s8) = G_ASHR %0, %1
+...
+---
+name: VectorVar
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: @VectorVar
+    ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+    ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+    ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(<4 x s16>) = COPY $d1
+    %2:_(<4 x s16>) = G_ASHR %0, %1
+...
+---
+name: VectorCst
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: @VectorCst
+    ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+    ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+    ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+    ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:4
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(s16) = G_CONSTANT i16 3
+    %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+    %3:_(<4 x s16>) = G_ASHR %0, %2
+...
+---
+name: VectorCst36
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: @VectorCst36
+    ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+    ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+    ; CHECK-NEXT: %2:_ KnownBits:0000000000000110 SignBits:13
+    ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13
+    ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:4
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(s16) = G_CONSTANT i16 3
+    %2:_(s16) = G_CONSTANT i16 6
+    %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
+    %4:_(<4 x s16>) = G_ASHR %0, %3
+...
+---
+name: VectorCst3unknown
+body: |
+  bb.1:
+    ; CHECK-LABEL: name: @VectorCst3unknown
+    ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+    ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+    ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+    ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+    ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
+    %0:_(<4 x s16>) = COPY $d0
+    %2:_(s16) = COPY $h0
+    %1:_(s16) = G_CONSTANT i16 3
+    %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
+    %4:_(<4 x s16>) = G_ASHR %0, %3
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index bd2d8c09..5c164bf 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -71,12 +71,13 @@
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 #
 # DEBUG-NEXT: G_ABDS (opcode 65): 1 type index, 0 imm indices
-# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
-# DEBUG-NEXT:G_ABDU (opcode 66): 1 type index, 0 imm indices
-# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: G_ABDU (opcode 66): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_IMPLICIT_DEF (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: {{[0-9]+}}, OK
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index be79135..747db39 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -14,10 +14,10 @@ define <8 x i16> @dupsext_v8i8_v8i16(i8 %src, <8 x i8> %b) {
 ; CHECK-GI-LABEL: dupsext_v8i8_v8i16:
 ; CHECK-GI: // %bb.0: // %entry
 ; CHECK-GI-NEXT: lsl w8, w0, #8
-; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT: sbfx w8, w8, #8, #8
 ; CHECK-GI-NEXT: dup v1.8h, w8
-; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: xtn v1.8b, v1.8h
+; CHECK-GI-NEXT: smull v0.8h, v1.8b, v0.8b
 ; CHECK-GI-NEXT: ret
 entry:
   %in = sext i8 %src to i16
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
index ff7872c..83530049a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -87,46 +87,17 @@ entry:
 }
 
 define void @memset_10_zeroval_volatile(ptr %dst) {
-; GISel-WITHOUT-MOPS-O0-LABEL: memset_10_zeroval_volatile:
-; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w1, wzr
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O0-NEXT: ret
-;
-; GISel-WITHOUT-MOPS-O3-LABEL: memset_10_zeroval_volatile:
-; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w1, wzr
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O3-NEXT: ret
-;
-; GISel-MOPS-O0-LABEL: memset_10_zeroval_volatile:
-; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: mov x9, xzr
-; GISel-MOPS-O0-NEXT: setp [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: setm [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: sete [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: ret
+; GISel-WITHOUT-MOPS-LABEL: memset_10_zeroval_volatile:
+; GISel-WITHOUT-MOPS: // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-NEXT: str xzr, [x0]
+; GISel-WITHOUT-MOPS-NEXT: strh wzr, [x0, #8]
+; GISel-WITHOUT-MOPS-NEXT: ret
 ;
-; GISel-MOPS-O3-LABEL: memset_10_zeroval_volatile:
-; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O3-NEXT: setp [x0]!, x8!, xzr
-; GISel-MOPS-O3-NEXT: setm [x0]!, x8!, xzr
-; GISel-MOPS-O3-NEXT: sete [x0]!, x8!, xzr
-; GISel-MOPS-O3-NEXT: ret
+; GISel-MOPS-LABEL: memset_10_zeroval_volatile:
+; GISel-MOPS: // %bb.0: // %entry
+; GISel-MOPS-NEXT: str xzr, [x0]
+; GISel-MOPS-NEXT: strh wzr, [x0, #8]
+; GISel-MOPS-NEXT: ret
 ;
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memset_10_zeroval_volatile:
 ; SDAG-WITHOUT-MOPS-O2: // %bb.0: // %entry
@@ -490,43 +461,46 @@ entry:
 define void @memset_10_volatile(ptr %dst, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-LABEL: memset_10_volatile:
 ; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O0-NEXT: // implicit-def: $x8
+; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, w1
+; GISel-WITHOUT-MOPS-O0-NEXT: and x8, x8, #0xff
+; GISel-WITHOUT-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
+; GISel-WITHOUT-MOPS-O0-NEXT: mul x8, x8, x9
+; GISel-WITHOUT-MOPS-O0-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
+; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8]
 ; GISel-WITHOUT-MOPS-O0-NEXT: ret
 ;
 ; GISel-WITHOUT-MOPS-O3-LABEL: memset_10_volatile:
 ; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memset
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
+; GISel-WITHOUT-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
+; GISel-WITHOUT-MOPS-O3-NEXT: and x9, x1, #0xff
+; GISel-WITHOUT-MOPS-O3-NEXT: mul x8, x9, x8
+; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT: strh w8, [x0, #8]
 ; GISel-WITHOUT-MOPS-O3-NEXT: ret
 ;
 ; GISel-MOPS-O0-LABEL: memset_10_volatile:
 ; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: // implicit-def: $x9
-; GISel-MOPS-O0-NEXT: mov w9, w1
-; GISel-MOPS-O0-NEXT: setp [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: setm [x0]!, x8!, x9
-; GISel-MOPS-O0-NEXT: sete [x0]!, x8!, x9
+; GISel-MOPS-O0-NEXT: // implicit-def: $x8
+; GISel-MOPS-O0-NEXT: mov w8, w1
+; GISel-MOPS-O0-NEXT: and x8, x8, #0xff
+; GISel-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
+; GISel-MOPS-O0-NEXT: mul x8, x8, x9
+; GISel-MOPS-O0-NEXT: str x8, [x0]
+; GISel-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
+; GISel-MOPS-O0-NEXT: strh w8, [x0, #8]
 ; GISel-MOPS-O0-NEXT: ret
 ;
 ; GISel-MOPS-O3-LABEL: memset_10_volatile:
 ; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
 ; GISel-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
-; GISel-MOPS-O3-NEXT: setp [x0]!, x8!, x1
-; GISel-MOPS-O3-NEXT: setm [x0]!, x8!, x1
-; GISel-MOPS-O3-NEXT: sete [x0]!, x8!, x1
+; GISel-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
+; GISel-MOPS-O3-NEXT: and x9, x1, #0xff
+; GISel-MOPS-O3-NEXT: mul x8, x9, x8
+; GISel-MOPS-O3-NEXT: str x8, [x0]
+; GISel-MOPS-O3-NEXT: strh w8, [x0, #8]
 ; GISel-MOPS-O3-NEXT: ret
 ;
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memset_10_volatile:
@@ -905,43 +879,21 @@ entry:
 }
 
 define void @memcpy_10_volatile(ptr %dst, ptr %src, i32 %value) {
-; GISel-WITHOUT-MOPS-O0-LABEL: memcpy_10_volatile:
-; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memcpy
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O0-NEXT: ret
-;
-; GISel-WITHOUT-MOPS-O3-LABEL: memcpy_10_volatile:
-; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memcpy
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; GISel-WITHOUT-MOPS-O3-NEXT: ret
-;
-; GISel-MOPS-O0-LABEL: memcpy_10_volatile:
-; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: cpyfp [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpyfm [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpyfe [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: ret
+; GISel-WITHOUT-MOPS-LABEL: memcpy_10_volatile:
+; GISel-WITHOUT-MOPS: // %bb.0: // %entry
+; GISel-WITHOUT-MOPS-NEXT: ldr x8, [x1]
+; GISel-WITHOUT-MOPS-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-NEXT: ldrh w8, [x1, #8]
+; GISel-WITHOUT-MOPS-NEXT: strh w8, [x0, #8]
+; GISel-WITHOUT-MOPS-NEXT: ret
 ;
-; GISel-MOPS-O3-LABEL: memcpy_10_volatile:
-; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O3-NEXT: cpyfp [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpyfm [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpyfe [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: ret
+; GISel-MOPS-LABEL: memcpy_10_volatile:
+; GISel-MOPS: // %bb.0: // %entry
+; GISel-MOPS-NEXT: ldr x8, [x1]
+; GISel-MOPS-NEXT: str x8, [x0]
+; GISel-MOPS-NEXT: ldrh w8, [x1, #8]
+; GISel-MOPS-NEXT: strh w8, [x0, #8]
+; GISel-MOPS-NEXT: ret
 ;
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memcpy_10_volatile:
 ; SDAG-WITHOUT-MOPS-O2: // %bb.0: // %entry
@@ -1736,40 +1688,34 @@ entry:
 define void @memmove_10_volatile(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-LABEL: memmove_10_volatile:
 ; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O0-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w2, w8
-; GISel-WITHOUT-MOPS-O0-NEXT: bl memmove
-; GISel-WITHOUT-MOPS-O0-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O0-NEXT: ldr x9, [x1]
+; GISel-WITHOUT-MOPS-O0-NEXT: ldrh w8, [x1, #8]
+; GISel-WITHOUT-MOPS-O0-NEXT: str x9, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8]
 ; GISel-WITHOUT-MOPS-O0-NEXT: ret
 ;
 ; GISel-WITHOUT-MOPS-O3-LABEL: memmove_10_volatile:
 ; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_def_cfa_offset 16
-; GISel-WITHOUT-MOPS-O3-NEXT: .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT: mov w2, #10 // =0xa
-; GISel-WITHOUT-MOPS-O3-NEXT: bl memmove
-; GISel-WITHOUT-MOPS-O3-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISel-WITHOUT-MOPS-O3-NEXT: ldr x8, [x1]
+; GISel-WITHOUT-MOPS-O3-NEXT: ldrh w9, [x1, #8]
+; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT: strh w9, [x0, #8]
 ; GISel-WITHOUT-MOPS-O3-NEXT: ret
 ;
 ; GISel-MOPS-O0-LABEL: memmove_10_volatile:
 ; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O0-NEXT: // kill: def $x8 killed $w8
-; GISel-MOPS-O0-NEXT: cpyp [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpym [x0]!, [x1]!, x8!
-; GISel-MOPS-O0-NEXT: cpye [x0]!, [x1]!, x8!
+; GISel-MOPS-O0-NEXT: ldr x9, [x1]
+; GISel-MOPS-O0-NEXT: ldrh w8, [x1, #8]
+; GISel-MOPS-O0-NEXT: str x9, [x0]
+; GISel-MOPS-O0-NEXT: strh w8, [x0, #8]
 ; GISel-MOPS-O0-NEXT: ret
 ;
 ; GISel-MOPS-O3-LABEL: memmove_10_volatile:
 ; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: mov w8, #10 // =0xa
-; GISel-MOPS-O3-NEXT: cpyp [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpym [x0]!, [x1]!, x8!
-; GISel-MOPS-O3-NEXT: cpye [x0]!, [x1]!, x8!
+; GISel-MOPS-O3-NEXT: ldr x8, [x1]
+; GISel-MOPS-O3-NEXT: ldrh w9, [x1, #8]
+; GISel-MOPS-O3-NEXT: str x8, [x0]
+; GISel-MOPS-O3-NEXT: strh w9, [x0, #8]
 ; GISel-MOPS-O3-NEXT: ret
 ;
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memmove_10_volatile:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 2f23a32..6e5c666 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -2264,33 +2264,12 @@ define <2 x i64> @lsr_const(<2 x i64> %a, <2 x i64> %b) {
 }
 
 define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-NEON-LABEL: asr:
-; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32
-; CHECK-NEON-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT: ret
-;
-; CHECK-SVE-LABEL: asr:
-; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32
-; CHECK-SVE-NEXT: shrn v1.2s, v1.2d, #32
-; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-SVE-NEXT: ret
-;
-; CHECK-GI-LABEL: asr:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #32
-; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #32
-; CHECK-GI-NEXT: fmov x10, d0
-; CHECK-GI-NEXT: fmov x11, d1
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mov x9, v1.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
-; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: fmov d0, x10
-; CHECK-GI-NEXT: mov v0.d[1], x8
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: asr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-NEXT: shrn v1.2s, v1.2d, #32
+; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: ret
   %x = ashr <2 x i64> %a, <i64 32, i64 32>
   %y = ashr <2 x i64> %b, <i64 32, i64 32>
   %z = mul nsw <2 x i64> %x, %y
@@ -2298,34 +2277,12 @@ define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) {
 }
 
 define <2 x i64> @asr_const(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-NEON-LABEL: asr_const:
-; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: movi v1.2s, #31
-; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32
-; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT: ret
-;
-; CHECK-SVE-LABEL: asr_const:
-; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: movi v1.2s, #31
-; CHECK-SVE-NEXT: shrn v0.2s, v0.2d, #32
-; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-SVE-NEXT: ret
-;
-; CHECK-GI-LABEL: asr_const:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: adrp x8, .LCPI81_0
-; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #32
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI81_0]
-; CHECK-GI-NEXT: fmov x10, d0
-; CHECK-GI-NEXT: fmov x11, d1
-; CHECK-GI-NEXT: mov x8, v0.d[1]
-; CHECK-GI-NEXT: mov x9, v1.d[1]
-; CHECK-GI-NEXT: mul x10, x10, x11
-; CHECK-GI-NEXT: mul x8, x8, x9
-; CHECK-GI-NEXT: fmov d0, x10
-; CHECK-GI-NEXT: mov v0.d[1], x8
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: asr_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.2s, #31
+; CHECK-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: ret
   %x = ashr <2 x i64> %a, <i64 32, i64 32>
   %z = mul nsw <2 x i64> %x, <i64 31, i64 31>
   ret <2 x i64> %z
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
index e31c9a0..113eb14 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -263,3 +263,110 @@ entry:
   %conv = zext i1 %cmp to i8
   ret i8 %conv
 }
+
+; Test ANDS.
+define i32 @test1_ands(i32 %a) {
+; CHECK-LABEL: test1_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and w8, w0, #0x3ffc00
+; CHECK-NEXT: ands w8, w8, #0xffe007ff
+; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: ret
+entry:
+  %ands = and i32 %a, 2098176
+  %c = icmp eq i32 %ands, 0
+  %r = select i1 %c, i32 %a, i32 %ands
+  ret i32 %r
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i32 @test2_ands(i32 %a) {
+; CHECK-LABEL: test2_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: ands w8, w0, w8
+; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: ret
+entry:
+  %ands = and i32 %a, 135
+  %c = icmp eq i32 %ands, 0
+  %r = select i1 %c, i32 %a, i32 %ands
+  ret i32 %r
+}
+
+; This constant should not be split because the split immediate is not valid
+; bitmask immediate.
+define i32 @test3_ands(i32 %a) {
+; CHECK-LABEL: test3_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1024 // =0x400
+; CHECK-NEXT: movk w8, #33, lsl #16
+; CHECK-NEXT: ands w8, w0, w8
+; CHECK-NEXT: csel w0, w0, w8, eq
+; CHECK-NEXT: ret
+entry:
+  %ands = and i32 %a, 2163712
+  %c = icmp eq i32 %ands, 0
+  %r = select i1 %c, i32 %a, i32 %ands
+  ret i32 %r
+}
+
+define i64 @test4_ands(i64 %a) {
+; CHECK-LABEL: test4_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and x8, x0, #0x3ffc00
+; CHECK-NEXT: ands x8, x8, #0xffffffffffe007ff
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+  %ands = and i64 %a, 2098176
+  %c = icmp eq i64 %ands, 0
+  %r = select i1 %c, i64 %a, i64 %ands
+  ret i64 %r
+}
+
+define i64 @test5_ands(i64 %a) {
+; CHECK-LABEL: test5_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and x8, x0, #0x3ffffc000
+; CHECK-NEXT: ands x8, x8, #0xfffffffe00007fff
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+  %ands = and i64 %a, 8589950976
+  %c = icmp eq i64 %ands, 0
+  %r = select i1 %c, i64 %a, i64 %ands
+  ret i64 %r
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i64 @test6_ands(i64 %a) {
+; CHECK-LABEL: test6_ands:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #135 // =0x87
+; CHECK-NEXT: ands x8, x0, x8
+; CHECK-NEXT: csel x0, x0, x8, eq
+; CHECK-NEXT: ret
+entry:
+  %ands = and i64 %a, 135
+  %c = icmp eq i64 %ands, 0
+  %r = select i1 %c, i64 %a, i64 %ands
+  ret i64 %r
+}
+
+; This constant should not be split because the split immediate is not valid
+; bitmask immediate.
+define i64 @test7_ands(i64 %a) { +; CHECK-LABEL: test7_ands: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1024 // =0x400 +; CHECK-NEXT: movk w8, #33, lsl #16 +; CHECK-NEXT: ands x8, x0, x8 +; CHECK-NEXT: csel x0, x0, x8, eq +; CHECK-NEXT: ret +entry: + %ands = and i64 %a, 2163712 + %c = icmp eq i64 %ands, 0 + %r = select i1 %c, i64 %a, i64 %ands + ret i64 %r +} diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll index bd28d13..256ff94 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll @@ -1,5 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for test_vmull_p8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_p64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vmull_high_p64 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5 @@ -101,11 +107,18 @@ entry: } define <8 x i16> @test_vaddl_a8(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vaddl_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddl_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddl_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %a to <8 x i16> %vmovl.i2.i = zext <8 x i8> %b to <8 x i16> @@ -229,11 +242,18 @@ entry: } define <8 x i16> @test_vaddl_high_a8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaddl_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddl_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddl2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddl_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddl2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -345,11 +365,18 @@ entry: } define <8 x i16> @test_vaddw_a8(<8 x i16> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vaddw_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddw_a8: +; CHECK-SD: // %bb.0: // %entry +; 
CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddw_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %b to <8 x i16> %add.i = add <8 x i16> %vmovl.i.i, %a @@ -458,11 +485,18 @@ entry: } define <8 x i16> @test_vaddw_high_a8(<8 x i16> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vaddw_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddw2 v0.8h, v0.8h, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddw_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddw2 v0.8h, v0.8h, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddw_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: uaddw2 v0.8h, v0.8h, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -574,11 +608,18 @@ entry: } define <8 x i16> @test_vsubl_a8(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vsubl_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubl_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubl_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %a to <8 x i16> %vmovl.i2.i = zext <8 x i8> %b to <8 x i16> @@ -702,11 +743,18 @@ entry: } define <8 x i16> @test_vsubl_high_a8(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsubl_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubl2 v0.8h, v0.16b, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubl_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubl2 v0.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubl_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubl2 v0.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -818,11 +866,18 @@ entry: } define <8 x i16> @test_vsubw_a8(<8 x i16> %a, <8 x i8> %b) { -; CHECK-LABEL: test_vsubw_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubw v0.8h, v0.8h, v1.8b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubw_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubw v0.8h, v0.8h, v1.8b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubw_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubw v0.8h, v0.8h, v1.8b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; 
CHECK-GI-NEXT: ret entry: %vmovl.i.i = zext <8 x i8> %b to <8 x i16> %sub.i = sub <8 x i16> %a, %vmovl.i.i @@ -931,11 +986,18 @@ entry: } define <8 x i16> @test_vsubw_high_a8(<8 x i16> %a, <16 x i8> %b) { -; CHECK-LABEL: test_vsubw_high_a8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: usubw2 v0.8h, v0.8h, v1.16b -; CHECK-NEXT: bic v0.8h, #255, lsl #8 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubw_high_a8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: usubw2 v0.8h, v0.8h, v1.16b +; CHECK-SD-NEXT: bic v0.8h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubw_high_a8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff +; CHECK-GI-NEXT: usubw2 v0.8h, v0.8h, v1.16b +; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ret entry: %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16> @@ -975,10 +1037,16 @@ entry: } define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <8 x i16> %a, %b %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -987,10 +1055,16 @@ entry: } define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <4 x i32> %a, %b %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -999,10 +1073,16 @@ entry: } define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <2 x i64> %a, %b %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32> @@ -1011,10 +1091,16 @@ entry: } define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <8 x i16> %a, %b %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, 
i16 8, i16 8, i16 8, i16 8> @@ -1023,10 +1109,16 @@ entry: } define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <4 x i32> %a, %b %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -1035,10 +1127,16 @@ entry: } define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vaddhn.i = add <2 x i64> %a, %b %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32> @@ -1047,11 +1145,20 @@ entry: } define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <8 x i16> %a, %b %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1064,11 +1171,20 @@ entry: } define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <4 x i32> %a, %b %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1081,11 +1197,20 @@ entry: } define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed 
$d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <2 x i64> %a, %b %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32> @@ -1098,11 +1223,20 @@ entry: } define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vaddhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <8 x i16> %a, %b %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1115,11 +1249,20 @@ entry: } define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vaddhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <4 x i32> %a, %b %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1132,11 +1275,20 @@ entry: } define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vaddhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: addhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vaddhn.i.i = add <2 x i64> %a, %b %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32> @@ -1209,11 +1361,19 @@ entry: } define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vraddhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: 
test_vraddhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1224,11 +1384,19 @@ entry: } define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vraddhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1239,11 +1407,19 @@ entry: } define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vraddhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -1254,11 +1430,19 @@ entry: } define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vraddhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1269,11 +1453,19 @@ entry: } define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vraddhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.8h, 
v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1284,11 +1476,19 @@ entry: } define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vraddhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: raddhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vraddhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: raddhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vraddhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: raddhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -1299,10 +1499,16 @@ entry: } define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <8 x i16> %a, %b %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1311,10 +1517,16 @@ entry: } define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <4 x i32> %a, %b %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -1323,10 +1535,16 @@ entry: } define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <2 x i64> %a, %b %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32> @@ -1335,10 +1553,16 @@ entry: } define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, 
<8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.8b, v0.8h, v1.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <8 x i16> %a, %b %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1347,10 +1571,16 @@ entry: } define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.4h, v0.4s, v1.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <4 x i32> %a, %b %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16> @@ -1359,10 +1589,16 @@ entry: } define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subhn v0.2s, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: subhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret entry: %vsubhn.i = sub <2 x i64> %a, %b %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32> @@ -1371,11 +1607,20 @@ entry: } define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <8 x i16> %a, %b %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1388,11 +1633,20 @@ entry: } define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov 
v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <4 x i32> %a, %b %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1405,11 +1659,20 @@ entry: } define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <2 x i64> %a, %b %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32> @@ -1422,11 +1685,20 @@ entry: } define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vsubhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <8 x i16> %a, %b %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> @@ -1439,11 +1711,20 @@ entry: } define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vsubhn_high_u32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_u32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_u32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <4 x i32> %a, %b %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16> @@ -1456,11 +1737,20 @@ entry: } define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vsubhn_high_u64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: subhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vsubhn_high_u64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vsubhn_high_u64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; 
CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vsubhn.i.i = sub <2 x i64> %a, %b %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32> @@ -1533,11 +1823,19 @@ entry: } define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vrsubhn_high_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.8b, v1.8h, v2.8h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) %0 = bitcast <8 x i8> %r to <1 x i64> @@ -1548,11 +1846,19 @@ entry: } define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_vrsubhn_high_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.4h, v1.4s, v2.4s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) %0 = bitcast <4 x i16> %r to <1 x i64> @@ -1563,11 +1869,19 @@ entry: } define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_vrsubhn_high_s64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_s64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_s64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.2s, v1.2d, v2.2d +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ret entry: %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) %0 = bitcast <2 x i32> %r to <1 x i64> @@ -1578,11 +1892,19 @@ entry: } define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_vrsubhn_high_u16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vrsubhn_high_u16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: rsubhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vrsubhn_high_u16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: rsubhn v1.8b, v1.8h, v2.8h +; 
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: ret
entry:
%vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
%0 = bitcast <8 x i8> %r to <1 x i64>
@@ -1593,11 +1915,19 @@ entry:
}
define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test_vrsubhn_high_u32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_vrsubhn_high_u32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: rsubhn2 v0.8h, v1.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_vrsubhn_high_u32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: rsubhn v1.4h, v1.4s, v2.4s
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: ret
entry:
%vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
%0 = bitcast <4 x i16> %r to <1 x i64>
@@ -1608,11 +1938,19 @@ entry:
}
define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vrsubhn_high_u64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_vrsubhn_high_u64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: rsubhn2 v0.4s, v1.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_vrsubhn_high_u64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: rsubhn v1.2s, v1.2d, v2.2d
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: ret
entry:
%vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
%0 = bitcast <2 x i32> %r to <1 x i64>
@@ -2535,21 +2873,40 @@ entry:
}
define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coerce) {
-; CHECK-LABEL: cmplx_mul_combined_re_im:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: lsr x8, x0, #16
-; CHECK-NEXT: movi v1.2d, #0xffff0000ffff0000
-; CHECK-NEXT: rev32 v4.8h, v0.8h
-; CHECK-NEXT: dup v2.8h, w8
-; CHECK-NEXT: sqneg v3.8h, v2.8h
-; CHECK-NEXT: bsl v1.16b, v2.16b, v3.16b
-; CHECK-NEXT: fmov d3, x0
-; CHECK-NEXT: sqdmull v2.4s, v4.4h, v1.4h
-; CHECK-NEXT: sqdmull2 v1.4s, v4.8h, v1.8h
-; CHECK-NEXT: sqdmlal v2.4s, v0.4h, v3.h[0]
-; CHECK-NEXT: sqdmlal2 v1.4s, v0.8h, v3.h[0]
-; CHECK-NEXT: uzp2 v0.8h, v2.8h, v1.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: cmplx_mul_combined_re_im:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsr x8, x0, #16
+; CHECK-SD-NEXT: movi v1.2d, #0xffff0000ffff0000
+; CHECK-SD-NEXT: rev32 v4.8h, v0.8h
+; CHECK-SD-NEXT: dup v2.8h, w8
+; CHECK-SD-NEXT: sqneg v3.8h, v2.8h
+; CHECK-SD-NEXT: bsl v1.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT: fmov d3, x0
+; CHECK-SD-NEXT: sqdmull v2.4s, v4.4h, v1.4h
+; CHECK-SD-NEXT: sqdmull2 v1.4s, v4.8h, v1.8h
+; CHECK-SD-NEXT: sqdmlal v2.4s, v0.4h, v3.h[0]
+; CHECK-SD-NEXT: sqdmlal2 v1.4s, v0.8h, v3.h[0]
+; CHECK-SD-NEXT: uzp2 v0.8h, v2.8h, v1.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: cmplx_mul_combined_re_im:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsr x9, x0, #16
+; CHECK-GI-NEXT: adrp x8, .LCPI196_0
+; CHECK-GI-NEXT: rev32 v4.8h, v0.8h
+; CHECK-GI-NEXT: ldr q3, [x8, :lo12:.LCPI196_0]
+; CHECK-GI-NEXT: fmov d1, x9
+; CHECK-GI-NEXT: dup v2.8h, v1.h[0]
+; CHECK-GI-NEXT: sqneg v1.8h, v2.8h
+; CHECK-GI-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
+; CHECK-GI-NEXT: mov d2, v0.d[1]
+; CHECK-GI-NEXT: fmov d3, x0
+; CHECK-GI-NEXT: sqdmull v2.4s, v2.4h, v3.h[0]
+; CHECK-GI-NEXT: sqdmull v5.4s, v4.4h, v1.4h
+; CHECK-GI-NEXT: sqdmlal v5.4s, v0.4h, v3.h[0]
+; CHECK-GI-NEXT: sqdmlal2 v2.4s, v4.8h, v1.8h
+; CHECK-GI-NEXT: uzp2 v0.8h, v5.8h, v2.8h
+; CHECK-GI-NEXT: ret
entry:
%scale.sroa.2.0.extract.shift23 = lshr i64 %scale.coerce, 16
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
index cc9732b..6c7ddd9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=arm64-none-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
@@ -197,11 +198,20 @@ define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
}
define <2 x i32> @test_sabd_v2i32_const() {
-; CHECK-LABEL: test_sabd_v2i32_const:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI19_0
-; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI19_0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_sabd_v2i32_const:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI19_0
+; CHECK-SD-NEXT: ldr d0, [x8, :lo12:.LCPI19_0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sabd_v2i32_const:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI19_1
+; CHECK-GI-NEXT: adrp x9, .LCPI19_0
+; CHECK-GI-NEXT: ldr d0, [x8, :lo12:.LCPI19_1]
+; CHECK-GI-NEXT: ldr d1, [x9, :lo12:.LCPI19_0]
+; CHECK-GI-NEXT: sabd v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT: ret
%1 = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(
<2 x i32> <i32 -2147483648, i32 2147450880>,
<2 x i32> <i32 -65536, i32 65535>)
@@ -293,15 +303,26 @@ define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
}
define <8 x i16> @test_uabd_knownbits_vec8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK-LABEL: test_uabd_knownbits_vec8i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.8h, #15
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: uabd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: rev64 v0.8h, v0.8h
-; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_uabd_knownbits_vec8i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v2.8h, #15
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: uabd v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: rev64 v0.8h, v0.8h
+; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_uabd_knownbits_vec8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v2.8h, #15
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: uabd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: rev64 v0.8h, v0.8h
+; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: ret
%and1 = and <8 x i16> %lhs, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%and2 = and <8 x i16> %rhs, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%uabd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %and1, <8 x i16> %and2)
@@ -311,11 +332,22 @@ define <8 x i16> @test_uabd_knownbits_vec8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
}
define <4 x i32> @knownbits_uabd_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: knownbits_uabd_mask_and_shuffle_lshr:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ushr v0.4s, v0.4s, #17
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: knownbits_uabd_mask_and_shuffle_lshr:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ushr v0.4s, v0.4s, #17
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: knownbits_uabd_mask_and_shuffle_lshr:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: rev64 v0.4s, v0.4s
+; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #17
+; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535>
%2 = and <4 x i32> %a1, <i32 65535, i32 65535, i32 65535, i32 65535>
%3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %1, <4 x i32> %2)
@@ -325,10 +357,19 @@ define <4 x i32> @knownbits_uabd_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32>
}
define <4 x i32> @knownbits_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: knownbits_mask_and_shuffle_lshr:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: knownbits_mask_and_shuffle_lshr:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: knownbits_mask_and_shuffle_lshr:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v2.4s, #127, msl #8
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #17
+; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767>
%2 = and <4 x i32> %a1, <i32 32767, i32 32767, i32 32767, i32 32767>
%3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %1, <4 x i32> %2)
@@ -338,20 +379,36 @@ define <4 x i32> @knownbits_mask_and_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1)
}
define <4 x i32> @test_sabd_knownbits_vec4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK-LABEL: test_sabd_knownbits_vec4i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI31_0
-; CHECK-NEXT: adrp x9, .LCPI31_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI31_0]
-; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI31_1]
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.2d, #0x0000ff000000ff
-; CHECK-NEXT: mov v0.s[1], v0.s[0]
-; CHECK-NEXT: trn2 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_sabd_knownbits_vec4i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI31_0
+; CHECK-SD-NEXT: adrp x9, .LCPI31_1
+; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI31_0]
+; CHECK-SD-NEXT: ldr q3, [x9, :lo12:.LCPI31_1]
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: movi v1.2d, #0x0000ff000000ff
+; CHECK-SD-NEXT: mov v0.s[1], v0.s[0]
+; CHECK-SD-NEXT: trn2 v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sabd_knownbits_vec4i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI31_2
+; CHECK-GI-NEXT: adrp x9, .LCPI31_1
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI31_2]
+; CHECK-GI-NEXT: ldr q3, [x9, :lo12:.LCPI31_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI31_0
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI31_0]
+; CHECK-GI-NEXT: movi v3.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: ret
%and1 = and <4 x i32> %lhs, <i32 255, i32 -1, i32 -1, i32 255>
%and2 = and <4 x i32> %rhs, <i32 255, i32 255, i32 -1, i32 -1>
%abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %and1, <4 x i32> %and2)
@@ -361,15 +418,27 @@ define <4 x i32> @test_sabd_knownbits_vec4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
}
define <4 x i32> @knownbits_sabd_and_mask(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: knownbits_sabd_and_mask:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI32_0
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI32_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: zip2 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: knownbits_sabd_and_mask:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI32_0
+; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI32_0]
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: zip2 v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: knownbits_sabd_and_mask:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI32_1
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI32_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI32_0
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI32_0]
+; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
%2 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 255, i32 4085>
%3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %1, <4 x i32> %2)
@@ -378,10 +447,25 @@ define <4 x i32> @knownbits_sabd_and_mask(<4 x i32> %a0, <4 x i32> %a1) {
}
define <4 x i32> @knownbits_sabd_and_or_mask(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: knownbits_sabd_and_or_mask:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: knownbits_sabd_and_or_mask:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: knownbits_sabd_and_or_mask:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI33_1
+; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI33_0
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: orr v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
%2 = or <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535>
%3 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 255, i32 4085>
@@ -392,18 +476,33 @@ define <4 x i32> @knownbits_sabd_and_or_mask(<4 x i32> %a0, <4 x i32> %a1) {
}
define <4 x i32> @knownbits_sabd_and_xor_mask(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: knownbits_sabd_and_xor_mask:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI34_0
-; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI34_0]
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: zip2 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: knownbits_sabd_and_xor_mask:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI34_0
+; CHECK-SD-NEXT: movi v3.2d, #0x00ffff0000ffff
+; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI34_0]
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: eor v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: eor v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: zip2 v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: knownbits_sabd_and_xor_mask:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI34_1
+; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI34_0
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NEXT: eor v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT: eor v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
%2 = xor <4 x i32> %1, <i32 65535, i32 65535, i32 65535, i32 65535>
%3 = and <4 x i32> %a1, <i32 -1, i32 -1, i32 255, i32 4085>
@@ -414,10 +513,24 @@ define <4 x i32> @knownbits_sabd_and_xor_mask(<4 x i32> %a0, <4 x i32> %a1) {
}
define <4 x i32> @knownbits_sabd_and_shl_mask(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: knownbits_sabd_and_shl_mask:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: knownbits_sabd_and_shl_mask:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: knownbits_sabd_and_shl_mask:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI35_1
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI35_0
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NEXT: shl v0.4s, v0.4s, #17
+; CHECK-GI-NEXT: shl v1.4s, v1.4s, #17
+; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 -65536, i32 -7, i32 -7, i32 -65536>
%2 = shl <4 x i32> %1, <i32 17, i32 17, i32 17, i32 17>
%3 = and <4 x i32> %a1, <i32 -65536, i32 -7, i32 -7, i32 -65536>
@@ -428,18 +541,32 @@ define <4 x i32> @knownbits_sabd_and_shl_mask(<4 x i32> %a0, <4 x i32> %a1) {
}
define <4 x i32> @knownbits_sabd_and_mul_mask(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: knownbits_sabd_and_mul_mask:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI36_0
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI36_0]
-; CHECK-NEXT: and v3.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT: mul v0.4s, v0.4s, v3.4s
-; CHECK-NEXT: mul v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: mov v0.s[1], v0.s[0]
-; CHECK-NEXT: trn2 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: knownbits_sabd_and_mul_mask:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: adrp x8, .LCPI36_0
+; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI36_0]
+; CHECK-SD-NEXT: and v3.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: mul v0.4s, v0.4s, v3.4s
+; CHECK-SD-NEXT: mul v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: mov v0.s[1], v0.s[0]
+; CHECK-SD-NEXT: trn2 v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: knownbits_sabd_and_mul_mask:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adrp x8, .LCPI36_1
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_1]
+; CHECK-GI-NEXT: adrp x8, .LCPI36_0
+; CHECK-GI-NEXT: and v3.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: and v2.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: mul v0.4s, v0.4s, v3.4s
+; CHECK-GI-NEXT: mul v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT: ret
%1 = and <4 x i32> %a0, <i32 -65536, i32 -7, i32 -7, i32 -65536>
%2 = mul <4 x i32> %a0, %1
%3 = and <4 x i32> %a1, <i32 -65536, i32 -7, i32 -7, i32 -65536>
diff --git a/llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir b/llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir
new file mode 100644
index 0000000..23ac67c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bsp_implicit_ops.mir
@@ -0,0 +1,98 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s
+
+
+---
+name: BSL_COPY
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+
+ ; CHECK-LABEL: name: BSL_COPY
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = ORRv16i8 killed renamable $q20, killed renamable $q20
+ ; CHECK-NEXT: renamable $q2 = BSLv16i8 killed renamable $q2, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q20, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
+---
+name: BSL
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+ ; CHECK-LABEL: name: BSL
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = BSLv16i8 killed renamable $q2, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q2, renamable $q21, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
+---
+name: BIF
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+ ; CHECK-LABEL: name: BIF
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = BIFv16i8 renamable $q2, renamable $q6, killed renamable $q20, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q20, renamable $q2, renamable $q6, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
+---
+name: BIT
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+
+ ; CHECK-LABEL: name: BIT
+ ; CHECK: liveins: $q20, $q21, $q22, $q23, $q6, $q1, $q7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $q2 = BITv16i8 renamable $q2, renamable $q21, killed renamable $q20, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ ; CHECK-NEXT: $q22 = ORRv16i8 $q0, killed $q0
+ ; CHECK-NEXT: $q23 = ORRv16i8 $q1, killed $q1
+ ; CHECK-NEXT: $q24 = ORRv16i8 $q2, killed $q2
+ ; CHECK-NEXT: $q25 = ORRv16i8 $q3, killed $q3
+ ; CHECK-NEXT: RET undef $lr, implicit $q22
+ renamable $q2 = BSPv16i8 killed renamable $q20, renamable $q21, renamable $q2, implicit killed $q21_q22_q23, implicit killed $q0_q1_q2_q3, implicit-def $q0_q1_q2_q3
+ $q22 = ORRv16i8 $q0, killed $q0
+ $q23 = ORRv16i8 $q1, killed $q1
+ $q24 = ORRv16i8 $q2, killed $q2
+ $q25 = ORRv16i8 $q3, killed $q3
+ RET_ReallyLR implicit $q22
+...
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
index 2b7fa08..e1ba0e9 100644
--- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -1631,7 +1631,6 @@ define i8 @combine_i8_sdiv_const100(i8 %x) {
; CHECK-GI-NEXT: sxtb w8, w0
; CHECK-GI-NEXT: mov w9, #41 // =0x29
; CHECK-GI-NEXT: mul w8, w8, w9
-; CHECK-GI-NEXT: sxth w8, w8
; CHECK-GI-NEXT: sbfx w8, w8, #8, #8
; CHECK-GI-NEXT: asr w8, w8, #4
; CHECK-GI-NEXT: ubfx w9, w8, #7, #1
diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll
index f82d1ed..df4889b 100644
--- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll
+++ b/llvm/test/CodeGen/AArch64/highextractbitcast.ll
@@ -1,6 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-LE
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes CHECK,CHECK-LE
; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI
+
+; CHECK-GI: warning: Instruction selection used fallback path for test_pmull_high_p8_128
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pmull_high_p8_64
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
@@ -12,10 +16,10 @@ declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %s1, <2 x i32> %s2)
define <4 x i32> @test_smull_high_s16_base(<8 x i16> %a, <8 x i16> %b) #0 {
-; CHECK-LE-LABEL: test_smull_high_s16_base:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_smull_high_s16_base:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_base:
; CHECK-BE: // %bb.0: // %entry
@@ -35,10 +39,10 @@ entry:
}
define <4 x i32> @test_smull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 {
-; CHECK-LE-LABEL: test_smull_high_s16_bitcasta1:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_smull_high_s16_bitcasta1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1:
; CHECK-BE: // %bb.0: // %entry
@@ -59,10 +63,10 @@ entry:
}
define <4 x i32> @test_smull_high_s16_bitcastb1(<8 x i16> %a, <16 x i8> %bb) #0 {
-; CHECK-LE-LABEL: test_smull_high_s16_bitcastb1:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_smull_high_s16_bitcastb1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcastb1:
; CHECK-BE: // %bb.0: // %entry
@@ -83,10 +87,10 @@ entry:
}
define <4 x i32> @test_smull_high_s16_bitcasta2(<2 x i64> %a, <8 x i16> %b) #0 {
-; CHECK-LE-LABEL: test_smull_high_s16_bitcasta2:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_smull_high_s16_bitcasta2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2:
; CHECK-BE: // %bb.0: // %entry
@@ -109,10 +113,10 @@ entry:
}
define <4 x i32> @test_smull_high_s16_bitcastb2(<8 x i16> %a, <16 x i8> %b) #0 {
-; CHECK-LE-LABEL: test_smull_high_s16_bitcastb2:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: smull2 v0.4s, v0.8h, v1.8h
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_smull_high_s16_bitcastb2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_smull_high_s16_bitcastb2:
; CHECK-BE: // %bb.0: // %entry
@@ -157,6 +161,13 @@ define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: test_smull_high_s16_bitcasta1_wrongindex:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-GI-NEXT: mov d1, v1.d[1]
+; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
entry:
%a = bitcast <2 x i64> %aa to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -186,6 +197,13 @@ define <4 x i32> @test_smull_high_s16_bitcastb1_wrongindex(<8 x i16> %a, <16 x i
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: test_smull_high_s16_bitcastb1_wrongindex:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: ext v1.16b, v1.16b, v0.16b, #6
+; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
entry:
%b = bitcast <16 x i8> %bb to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -215,6 +233,13 @@ define <4 x i32> @test_smull_high_s16_bitcasta2_wrongindex(<4 x i32> %a, <8 x i1
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: test_smull_high_s16_bitcasta2_wrongindex:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-GI-NEXT: mov d1, v1.d[1]
+; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
entry:
%s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 2>
%s1 = bitcast <2 x i32> %s1a to <4 x i16>
@@ -244,6 +269,13 @@ define <4 x i32> @test_smull_high_s16_bitcastb2_wrongindex(<8 x i16> %a, <16 x i
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: test_smull_high_s16_bitcastb2_wrongindex:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: ext v1.16b, v1.16b, v0.16b, #4
+; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
entry:
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -269,6 +301,12 @@ define <4 x i32> @test_smull_high_s16_splata1(<2 x i64> %aa, <8 x i16> %b) #0 {
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: test_smull_high_s16_splata1:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov d1, v1.d[1]
+; CHECK-GI-NEXT: smull v0.4s, v1.4h, v0.h[3]
+; CHECK-GI-NEXT: ret
entry:
%a = bitcast <2 x i64> %aa to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -293,6 +331,12 @@ define <4 x i32> @test_smull_high_s16_splatb1(<8 x i16> %a, <16 x i8> %bb) #0 {
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: test_smull_high_s16_splatb1:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[3]
+; CHECK-GI-NEXT: ret
entry:
%b = bitcast <16 x i8> %bb to <8 x i16>
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -322,6 +366,13 @@ define <4 x i32> @test_smull_high_s16_splata2(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: test_smull_high_s16_splata2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: dup v0.2s, v0.s[3]
+; CHECK-GI-NEXT: mov d1, v1.d[1]
+; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
entry:
%s1a = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%s1 = bitcast <2 x i32> %s1a to <4 x i16>
@@ -351,6 +402,13 @@ define <4 x i32> @test_smull_high_s16_splatb2(<8 x i16> %a, <16 x i8> %b) #0 {
; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: test_smull_high_s16_splatb2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: dup v1.8b, v1.b[3]
+; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
entry:
%s1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%s2a = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -362,10 +420,10 @@ entry:
define <4 x i32> @test_umull_high_s16_bitcasta1(<2 x i64> %aa, <8 x i16> %b) #0 {
-; CHECK-LE-LABEL: test_umull_high_s16_bitcasta1:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: umull2 v0.4s, v0.8h, v1.8h
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_umull_high_s16_bitcasta1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_umull_high_s16_bitcasta1:
; CHECK-BE: // %bb.0: // %entry
@@ -386,10 +444,10 @@ entry:
}
define <8 x i16> @test_vabdl_high_u82(<16 x i8> %a, <8 x i16> %bb) {
-; CHECK-LE-LABEL: test_vabdl_high_u82:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: uabdl2 v0.8h, v0.16b, v1.16b
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_vabdl_high_u82:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uabdl2 v0.8h, v0.16b, v1.16b
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_vabdl_high_u82:
; CHECK-BE: // %bb.0: // %entry
@@ -411,10 +469,10 @@ entry:
}
define <8 x i16> @test_vabdl_high_s82(<16 x i8> %a, <8 x i16> %bb) {
-; CHECK-LE-LABEL: test_vabdl_high_s82:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: sabdl2 v0.8h, v0.16b, v1.16b
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_vabdl_high_s82:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sabdl2 v0.8h, v0.16b, v1.16b
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_vabdl_high_s82:
; CHECK-BE: // %bb.0: // %entry
@@ -436,10 +494,10 @@ entry:
}
define <4 x i32> @test_vqdmlal_high_s16_bitcast(<4 x i32> %a, <8 x i16> %b, <16 x i8> %cc) {
-; CHECK-LE-LABEL: test_vqdmlal_high_s16_bitcast:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_vqdmlal_high_s16_bitcast:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmlal2 v0.4s, v1.8h, v2.8h
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_vqdmlal_high_s16_bitcast:
; CHECK-BE: // %bb.0: // %entry
@@ -463,12 +521,12 @@ entry:
}
define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) {
-; CHECK-LE-LABEL: test_pmull_high_p8_128:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: fmov d0, x3
-; CHECK-LE-NEXT: fmov d1, x1
-; CHECK-LE-NEXT: pmull v0.8h, v1.8b, v0.8b
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_pmull_high_p8_128:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, x3
+; CHECK-NEXT: fmov d1, x1
+; CHECK-NEXT: pmull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_pmull_high_p8_128:
; CHECK-BE: // %bb.0: // %entry
@@ -490,10 +548,10 @@ entry:
}
define <8 x i16> @test_pmull_high_p8_64(<2 x i64> %aa, <2 x i64> %bb) {
-; CHECK-LE-LABEL: test_pmull_high_p8_64:
-; CHECK-LE: // %bb.0: // %entry
-; CHECK-LE-NEXT: pmull2 v0.8h, v0.16b, v1.16b
-; CHECK-LE-NEXT: ret
+; CHECK-LABEL: test_pmull_high_p8_64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: pmull2 v0.8h, v0.16b, v1.16b
+; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_pmull_high_p8_64:
; CHECK-BE: // %bb.0: // %entry
@@ -532,6 +590,14 @@ define <8 x i16> @foov8i16(<16 x i8> %a1, <2 x i64> %b1) {
; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: foov8i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #5
+; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #5
+; CHECK-GI-NEXT: fmov x8, d1
+; CHECK-GI-NEXT: mov v0.d[1], x8
+; CHECK-GI-NEXT: ret
%a0 = bitcast <16 x i8> %a1 to <4 x i32>
%b0 = bitcast <2 x i64> %b1 to <4 x i32>
%vshrn_low_shift = lshr <4 x i32> %a0, <i32 5, i32 5, i32 5, i32 5>
@@ -558,6 +624,12 @@ define <2 x i64> @hadd32_zext_asr(<16 x i8> %src1a) {
; CHECK-BE-NEXT: ushll2 v0.2d, v0.4s, #1
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: hadd32_zext_asr:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov d0, v0.d[1]
+; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #1
+; CHECK-GI-NEXT: ret
%src1 = bitcast <16 x i8> %src1a to <4 x i32>
%s1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%zextsrc1 = zext <2 x i32> %s1 to <2 x i64>
@@ -580,6 +652,12 @@ define <2 x i64> @test_umull_high_s16_splata1(<2 x i64> %aa, <4 x i32> %b) #0 {
; CHECK-BE-NEXT: umull2 v0.2d, v1.4s, v0.s[1]
; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-BE-NEXT: ret
+;
+; CHECK-GI-LABEL: test_umull_high_s16_splata1:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov d1, v1.d[1]
+; CHECK-GI-NEXT: umull v0.2d, v1.2s, v0.s[1]
+; CHECK-GI-NEXT: ret
entry:
%a = bitcast <2 x i64> %aa to <4 x i32>
%s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
diff --git a/llvm/test/CodeGen/AArch64/neon-saba.ll b/llvm/test/CodeGen/AArch64/neon-saba.ll
index 78ccc89..19967bd 100644
--- a/llvm/test/CodeGen/AArch64/neon-saba.ll
+++ b/llvm/test/CodeGen/AArch64/neon-saba.ll
@@ -1,13 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; SABA from ADD(ABS(SUB NSW))
define <4 x i32> @saba_abs_4s(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
-; CHECK-LABEL: saba_abs_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saba_abs_4s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saba v0.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saba_abs_4s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: abs v1.4s, v1.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: ret
%sub = sub nsw <4 x i32> %b, %c
%abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true)
%add = add <4 x i32> %a, %abs
@@ -15,10 +23,17 @@ define <4 x i32> @saba_abs_4s(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
}
define <2 x i32> @saba_abs_2s(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
-; CHECK-LABEL: saba_abs_2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saba_abs_2s:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saba v0.2s, v1.2s, v2.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saba_abs_2s:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: abs v1.2s, v1.2s
+; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT: ret
%sub = sub nsw <2 x i32> %b, %c
%abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true)
%add = add <2 x i32> %a, %abs
@@ -26,10 +41,17 @@ define <2 x i32> @saba_abs_2s(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
}
define <8 x i16> @saba_abs_8h(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
-; CHECK-LABEL: saba_abs_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saba_abs_8h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saba v0.8h, v1.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saba_abs_8h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT: abs v1.8h, v1.8h
+; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: ret
%sub = sub nsw <8 x i16> %b, %c
%abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true)
%add = add <8 x i16> %a, %abs
@@ -37,10 +59,17 @@ define <8 x i16> @saba_abs_8h(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
}
define <4 x i16> @saba_abs_4h(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
-; CHECK-LABEL: saba_abs_4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saba_abs_4h:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saba v0.4h, v1.4h, v2.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saba_abs_4h:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: abs v1.4h, v1.4h
+; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT: ret
%sub = sub nsw <4 x i16> %b, %c
%abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true)
%add = add <4 x i16> %a, %abs
@@ -48,10 +77,17 @@ define <4 x i16> @saba_abs_4h(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
}
define <16 x i8> @saba_abs_16b(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
-; CHECK-LABEL: saba_abs_16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saba_abs_16b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saba v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saba_abs_16b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: abs v1.16b, v1.16b
+; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT: ret
%sub = sub nsw <16 x i8> %b, %c
%abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true)
%add = add <16 x i8> %a, %abs
@@ -59,10 +95,17 @@ define <16 x i8> @saba_abs_16b(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
}
define <8 x i8> @saba_abs_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
-; CHECK-LABEL: saba_abs_8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: saba_abs_8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: saba v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: saba_abs_8b:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: abs v1.8b, v1.8b
+; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: ret
%sub = sub nsw <8 x i8> %b, %c
%abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %sub, i1 true)
%add = add <8 x i8> %a, %abs
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index b124042..c57383a 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -52,7 +52,6 @@ define i8 @si8_100(i8 %a, i8 %b) {
; CHECK-GI-NEXT: sxtb w8, w0
; CHECK-GI-NEXT: mov w9, #41 // =0x29
; CHECK-GI-NEXT: mul w8, w8, w9
-; CHECK-GI-NEXT: sxth w8, w8
; CHECK-GI-NEXT: sbfx w8, w8, #8, #8
; CHECK-GI-NEXT: asr w8, w8, #4
; CHECK-GI-NEXT: ubfx w9, w8, #7, #1
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging.ll b/llvm/test/CodeGen/AArch64/stack-tagging.ll
index 8759fb1..5d73c7b 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging.ll
@@ -143,54 +143,4 @@ l:
; CHECK-NOT: @llvm.aarch64.irg.sp
; CHECK: ret void
-; If we can't trace one of the lifetime markers to a single alloca, fall back
-; to poisoning all allocas at the beginning of the function.
-; Each alloca must be poisoned only once.
-define void @UnrecognizedLifetime(i8 %v) sanitize_memtag {
-entry:
- %x = alloca i32, align 4
- %y = alloca i32, align 4
- %z = alloca i32, align 4
- %tobool = icmp eq i8 %v, 0
- %xy = select i1 %tobool, ptr %x, ptr %y
- %cxcy = select i1 %tobool, ptr %x, ptr %y
- br label %another_bb
-
-another_bb:
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %z)
- store i32 7, ptr %z
- call void @noUse32(ptr %z)
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %z)
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %z)
- store i32 7, ptr %z
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %z)
- call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cxcy)
- store i32 8, ptr %xy
- call void @noUse32(ptr %x)
- call void @noUse32(ptr %y)
- call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cxcy)
- ret void
-}
-
-; CHECK-LABEL: define void @UnrecognizedLifetime(
-; CHECK: call ptr @llvm.aarch64.irg.sp(i64 0)
-; CHECK: alloca { i32, [12 x i8] }, align 16
-; CHECK: call ptr @llvm.aarch64.tagp
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: alloca { i32, [12 x i8] }, align 16
-; CHECK: call ptr @llvm.aarch64.tagp
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: alloca { i32, [12 x i8] }, align 16
-; CHECK: call ptr @llvm.aarch64.tagp
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: store i32
-; CHECK: call void @noUse32(ptr
-; CHECK: store i32
-; CHECK: store i32
-; CHECK: call void @noUse32(ptr
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: call void @llvm.aarch64.settag(
-; CHECK: ret void
-
!0 = !{}
diff --git a/llvm/test/CodeGen/AArch64/wineh-reuse-catch-alloca.ll b/llvm/test/CodeGen/AArch64/wineh-reuse-catch-alloca.ll
deleted file mode 100644
index 18b8aab..0000000
--- a/llvm/test/CodeGen/AArch64/wineh-reuse-catch-alloca.ll
+++ /dev/null
@@ -1,100 +0,0 @@
-; RUN: llc %s --mtriple=aarch64-pc-windows-msvc -o - | FileCheck %s
-
-; Tests the fixed object layouts when two catchpads re-use the same stack
-; allocation for this catch objects.
-
-; Generated from this C++ code, with modifications to the IR (see comments in
-; IR):
-; https://godbolt.org/z/9qv5Yn68j
-; > clang --target=aarch64-pc-windows-msvc test.cpp
-; ```
-; extern "C" void boom();
-; extern "C" int calls_boom();
-; {
-; try { boom(); }
-; catch (int& i) { return i; }
-; catch (long& l) { return l; }
-; return 0;
-; }
-; ```
-
-; Only need 48 bytes on the stack, not 64.
-; CHECK-LABEL: calls_boom:
-; CHECK: sub sp, sp, #48
-; CHECK: .seh_stackalloc 48
-
-; Both the catch blocks load from the same address.
-; CHECK-LABEL: "?catch$3@?0?calls_boom@4HA":
-; CHECK: ldr x8, [x29, #24]
-; CHECK-LABEL: "?catch$4@?0?calls_boom@4HA":
-; CHECK: ldr x8, [x29, #24]
-
-; There's enough space for the UnwindHelp to be at -16 instead of -32
-; CHECK-LABEL: $cppxdata$calls_boom:
-; CHECK: .word -16 // UnwindHelp
-
-; Both catches have the same object offset.
-; CHECK-LABEL: $handlerMap$0$calls_boom:
-; CHECK: .word -8 // CatchObjOffset
-; CHECK-NEXT: .word "?catch$3@?0?calls_boom@4HA"@IMGREL // Handler
-; CHECK: .word -8 // CatchObjOffset
-; CHECK-NEXT: .word "?catch$4@?0?calls_boom@4HA"@IMGREL // Handler
-
-%rtti.TypeDescriptor2 = type { ptr, ptr, [3 x i8] }
-
-$"??_R0H@8" = comdat any
-
-$"??_R0J@8" = comdat any
-
-@"??_7type_info@@6B@" = external constant ptr
-@"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { ptr @"??_7type_info@@6B@", ptr null, [3 x i8] c".H\00" }, comdat
-@"??_R0J@8" = linkonce_odr global %rtti.TypeDescriptor2 { ptr @"??_7type_info@@6B@", ptr null, [3 x i8] c".J\00" }, comdat
-
-define dso_local i32 @calls_boom() personality ptr @__CxxFrameHandler3 {
-entry:
- %retval = alloca i32, align 4
-; MODIFICATION: Remove unusued alloca
-; %l = alloca ptr, align 8
- %i = alloca ptr, align 8
- invoke void @boom()
- to label %invoke.cont unwind label %catch.dispatch
-
-catch.dispatch:
- %0 = catchswitch within none [label %catch1, label %catch] unwind to caller
-
-catch1:
- %1 = catchpad within %0 [ptr @"??_R0H@8", i32 8, ptr %i]
- %2 = load ptr, ptr %i, align 8
- %3 = load i32, ptr %2, align 4
- store i32 %3, ptr %retval, align 4
- catchret from %1 to label %catchret.dest2
-
-catch:
-; MODIFICATION: Use %i instead of %l
- %4 = catchpad within %0 [ptr @"??_R0J@8", i32 8, ptr %i]
- %5 = load ptr, ptr %i, align 8
- %6 = load i32, ptr %5, align 4
- store i32 %6, ptr %retval, align 4
- catchret from %4 to label %catchret.dest
-
-invoke.cont:
- br label %try.cont
-
-catchret.dest:
- br label %return
-
-catchret.dest2:
- br label %return
-
-try.cont:
- store i32 0, ptr %retval, align 4
- br label %return
-
-return:
- %7 = load i32, ptr %retval, align 4
- ret i32 %7
-}
-
-declare dso_local void @boom() #1
-
-declare dso_local i32 @__CxxFrameHandler3(...)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 9b35920..fa4676e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3211,7 +3211,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_ieee_mode = 1
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
-; GFX10-NEXT: enable_fwd_progress = 0
+; GFX10-NEXT: enable_fwd_progress = 1
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
@@ -3303,7 +3303,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: enable_ieee_mode = 1
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
-; GFX11-NEXT: enable_fwd_progress = 0
+; GFX11-NEXT: enable_fwd_progress = 1
; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
@@ -4215,7 +4215,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ieee_mode = 1
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
-; GFX10-NEXT: enable_fwd_progress = 0
+; GFX10-NEXT: enable_fwd_progress = 1
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
@@ -4300,7 +4300,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ieee_mode = 1
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
-; GFX11-NEXT: enable_fwd_progress = 0
+; GFX11-NEXT: enable_fwd_progress = 1
; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
@@ -4569,7 +4569,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ieee_mode = 1
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
-; GFX10-NEXT: enable_fwd_progress = 0
+; GFX10-NEXT: enable_fwd_progress = 1
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
@@ -4657,7 +4657,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ieee_mode = 1
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
-; GFX11-NEXT: enable_fwd_progress = 0
+; GFX11-NEXT: enable_fwd_progress = 1
; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir
index 5b8c284..dde566d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir
@@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
---
name: bswap_i32_vv
@@ -19,6 +21,7 @@ body: |
 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16711935
 ; GFX7-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 [[S_MOV_B32_]], [[V_ALIGNBIT_B32_e64_1]], [[V_ALIGNBIT_B32_e64_]], implicit $exec
 ; GFX7-NEXT: S_ENDPGM 0, implicit [[V_BFI_B32_e64_]]
+ ;
 ; GFX8-LABEL: name: bswap_i32_vv
 ; GFX8: liveins: $vgpr0
 ; GFX8-NEXT: {{ $}}
@@ -26,6 +29,22 @@ body: |
 ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051
 ; GFX8-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec
 ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]]
+ ;
+ ; GFX9-LABEL: name: bswap_i32_vv
+ ; GFX9: liveins: $vgpr0
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051
+ ; GFX9-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]]
+ ;
+ ; GFX10-LABEL: name: bswap_i32_vv
+ ; GFX10: liveins: $vgpr0
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051
+ ; GFX10-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]]
 %0:vgpr(s32) = COPY $vgpr0
 %1:vgpr(s32) = G_BSWAP %0
 S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
index 0a4cb3cc..fa95f33 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir
@@ -1,8 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s
---
@@ -24,6 +24,24 @@ body: |
 ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
 ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]]
 ;
+ ; GFX9-LABEL: name: fshr_s32
+ ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX9-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]]
+ ;
+ ; GFX10-LABEL: name: fshr_s32
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]]
+ ;
 ; GFX11-LABEL: name: fshr_s32
 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
 ; GFX11-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
index be3fe91..4f5f52b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
@@ -31,3 +31,33 @@ body: |
 S_ENDPGM 0
...
+---
+name: memcpy_test_volatile
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+ ; CHECK-LABEL: name: memcpy_test_volatile
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+ ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8))
+ ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8))
+ ; CHECK-NEXT: S_ENDPGM 0
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = COPY $vgpr1
+ %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+ %3:_(s32) = COPY $vgpr2
+ %4:_(s32) = COPY $vgpr3
+ %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+ %6:_(s32) = G_CONSTANT i32 1
+ %7:_(s64) = G_ZEXT %6:_(s32)
+ G_MEMCPY %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8))
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
index a82ca30..0392aef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
@@ -31,3 +31,33 @@ body: |
 S_ENDPGM 0
...
+--- +name: memcpyinline_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: memcpyinline_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8)) + ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMCPY_INLINE %2:_(p0), %5:_(p0), %7:_(s64) :: (volatile store (s8)), (volatile load (s8)) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir index e7cfaab..1f8d1aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir @@ -31,3 +31,33 @@ body: | S_ENDPGM 0 ... +--- +name: memmove_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: memmove_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8)) + ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMMOVE %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8)) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir index 021cebb..dda94e15 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir @@ -30,3 +30,32 @@ body: | S_ENDPGM 0 ... 
+--- +name: memset_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: memset_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s8) = COPY [[TRUNC]](s8) + ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s16) = G_TRUNC %3:_(s32) + %5:_(s8) = G_TRUNC %4:_(s16) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMSET %2:_(p0), %5:_(s8), %7:_(s64), 0 :: (volatile store (s8)) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir index cd69104..69e3561 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir @@ -80,8 +80,7 @@ body: | ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG]], [[SEXT_INREG1]] ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[MUL]], [[C]](s32) - ; GFX8-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 - ; GFX8-NEXT: $vgpr0 = COPY [[SEXT_INREG2]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[ASHR]](s32) ; ; GFX9-LABEL: name: test_smulh_s16 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -93,8 +92,7 @@ body: | ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG]], [[SEXT_INREG1]] ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[MUL]], [[C]](s32) - ; GFX9-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 - ; GFX9-NEXT: $vgpr0 = COPY [[SEXT_INREG2]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[ASHR]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -200,9 +198,7 @@ body: | ; GFX9-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV3]], 16 ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG2]], [[SEXT_INREG3]] ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[MUL1]], [[C]](s32) - ; GFX9-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 - ; GFX9-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR1]], 16 - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG4]](s32), [[SEXT_INREG5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ASHR]](s32), [[ASHR1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir index 2c545c8..1025d60 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir @@ -92,8 +92,7 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GCN-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C]](s32) - ; GCN-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 20 - ; GCN-NEXT: 
$vgpr0 = COPY [[SEXT_INREG]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[ASHR]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 16 %2:_(s32) = G_ASHR %0, %1(s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir new file mode 100644 index 0000000..beca901 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir @@ -0,0 +1,40 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +--- +name: basic_test +legalized: true +machineFunctionInfo: + isWholeWaveFunction: true +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: basic_test + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:vcc(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[COPY3]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0 + ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %0:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + %12:_(s32) = G_CONSTANT i32 5 + %11:_(s32) = G_SELECT %0(s1), %1, %12 + %14:_(s32) = G_CONSTANT i32 3 + %13:_(s32) = G_SELECT %0(s1), %2, %14 + %15:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), %11(s32), %13(s32), 1, 1, 1, 0 + $vgpr0 = COPY %15(s32) + G_AMDGPU_WHOLE_WAVE_FUNC_RETURN %0(s1), implicit $vgpr0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll index d4826a2..6044f6e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll @@ -7,7 +7,7 @@ ; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}} ; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf02c0{{$}} ; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}} -; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x600f0000{{$}} +; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xe00f0000{{$}} define amdgpu_cs half @cs_amdpal(half %arg0) #0 { %add = fadd half %arg0, 1.0 ret half %add diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll index 7ce5a00..d91b2117 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll @@ -514,9 +514,9 @@ define internal void @callee_no_alias_addr_space_select(ptr %ptr1, ptr %ptr2, pt ret void } -define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val, i32 %offset) #0 { +define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val) #0 { ; CHECK-LABEL: define internal void @callee_alias_addr_space_branch( -; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1]] { +; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: br i1 [[COND1]], label %[[BB_1_TRUE:.*]], label %[[BB_1_FALSE:.*]] ; CHECK: [[BB_1_TRUE]]: ; CHECK-NEXT: br label %[[BB_1_END:.*]] diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 029604c..b49614d 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -2,6 +2,27 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s ; TODO: Add global-isel when it can support bf16 +define amdgpu_ps void @llvm_sqrt_bf16_v(ptr addrspace(1) %out, bfloat %src) { +; GCN-LABEL: llvm_sqrt_bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_sqrt_bf16_e32 v2, v2 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src) + store bfloat %sqrt, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) { +; GCN-LABEL: llvm_sqrt_bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: v_sqrt_bf16_e32 v2, s0 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src) + store bfloat %sqrt, ptr addrspace(1) %out, align 2 + ret void +} define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) { ; GCN-LABEL: llvm_log2_bf16_v: @@ -47,5 +68,6 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src ret void } +declare bfloat @llvm.sqrt.bf16(bfloat) declare bfloat @llvm.log2.bf16(bfloat) declare bfloat @llvm.exp2.bf16(bfloat) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index cd6d741..7859fcdf 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2,7 +2,8 @@ ; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN ; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s 
-check-prefixes=GFX7 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9,GFX900 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 | FileCheck %s -check-prefixes=GFX9,GFX950 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 @@ -967,12 +968,21 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_store_global_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_store_global_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_store_dword v[1:2], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_store_global_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: global_store_dword v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_store_global_v2bf16: ; GFX10: ; %bb.0: @@ -2019,23 +2029,41 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_store_global_v64bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_store_global_v64bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 +; 
GFX900-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[0:3], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_store_global_v64bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[0:3], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_store_global_v64bf16: ; GFX10: ; %bb.0: @@ -2204,20 +2232,30 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_f32_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_load_store_f32_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v0, v[0:1], off +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX900-NEXT: global_store_short_d16_hi v[2:3], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_load_store_f32_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v0, v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: global_store_short v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_load_store_f32_to_bf16: ; GFX10: ; %bb.0: @@ -2308,30 +2346,50 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_f64_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_movk_i32 s8, 0x7fff -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 
s[4:5], 1, v7 -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| -; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_add3_u32 v4, v5, v4, s8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc -; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_load_store_f64_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] +; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX900-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 +; GFX900-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| +; GFX900-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] +; GFX900-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX900-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_add3_u32 v4, v5, v4, s8 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc +; GFX900-NEXT: global_store_short_d16_hi v[2:3], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_load_store_f64_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX950-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| +; GFX950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 +; GFX950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] +; GFX950-NEXT: v_add_u32_e32 v0, v6, v0 +; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: global_store_short v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_load_store_f64_to_bf16: ; GFX10: ; %bb.0: @@ -2858,12 +2916,21 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_short v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_arg_store: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_store_short v[1:2], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_arg_store: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: global_store_short v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_arg_store: ; GFX10: ; %bb.0: @@ -2918,12 +2985,21 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_arg_store_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_store_dword v[1:2], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_arg_store_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: global_store_dword v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_arg_store_v2bf16: ; GFX10: ; %bb.0: @@ -3384,12 +3460,19 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_byval: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_byval: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_byval: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_short off, v0, s32 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_byval: ; GFX10: ; %bb.0: @@ -3440,12 +3523,19 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_sret: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_sret: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_sret: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_short v0, v1, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_sret: ; GFX10: ; %bb.0: @@ -3907,34 +3997,63 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, 
test_arg_store@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v2, s30, 0 +; GFX900-NEXT: v_writelane_b32 v2, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v2, 1 +; GFX900-NEXT: v_readlane_b32 s30, v2, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v4, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v4, s30, 0 +; GFX950-NEXT: v_writelane_b32 v4, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_short v1, v0, off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v4, 1 +; GFX950-NEXT: v_readlane_b32 s30, v4, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call: ; GFX10: ; %bb.0: ; %entry @@ -4104,34 +4223,63 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v2bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v2bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v2, s30, 0 +; GFX900-NEXT: v_writelane_b32 v2, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v2, 1 +; GFX900-NEXT: v_readlane_b32 s30, v2, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v2bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v4, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v4, s30, 0 +; GFX950-NEXT: v_writelane_b32 v4, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dword v1, v0, off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v4, 1 +; GFX950-NEXT: v_readlane_b32 s30, v4, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 
exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v2bf16: ; GFX10: ; %bb.0: ; %entry @@ -4308,36 +4456,68 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v3bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v3bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v3, s30, 0 +; GFX900-NEXT: v_writelane_b32 v3, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v3, 1 +; GFX900-NEXT: v_readlane_b32 s30, v3, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v3bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, 
test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v5, s30, 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 +; GFX950-NEXT: v_mov_b32_e32 v4, v2 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_short v4, v1, off offset:4 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: scratch_store_dword v4, v0, off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v5, 1 +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v3bf16: ; GFX10: ; %bb.0: ; %entry @@ -4534,36 +4714,66 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v4bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v4bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v3, s30, 0 +; GFX900-NEXT: v_writelane_b32 v3, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v3, 1 +; 
GFX900-NEXT: v_readlane_b32 s30, v3, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v4bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v5, s30, 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 +; GFX950-NEXT: v_mov_b32_e32 v4, v2 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v5, 1 +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v4bf16: ; GFX10: ; %bb.0: ; %entry @@ -4804,40 +5014,69 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v8bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v5, s30, 0 -; GFX9-NEXT: v_writelane_b32 v5, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v5, 1 -; GFX9-NEXT: v_readlane_b32 s30, v5, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v8bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 
s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v5, s30, 0 +; GFX900-NEXT: v_writelane_b32 v5, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v5, 1 +; GFX900-NEXT: v_readlane_b32 s30, v5, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v8bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v5, s30, 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dwordx4 v4, v[0:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v5, 1 +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v8bf16: ; GFX10: ; %bb.0: ; %entry @@ -5174,48 +5413,79 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v16bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], 
s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v9, s30, 0 -; GFX9-NEXT: v_writelane_b32 v9, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v9, 1 -; GFX9-NEXT: v_readlane_b32 s30, v9, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v16bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v9, s30, 0 +; GFX900-NEXT: v_writelane_b32 v9, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v9, 1 +; GFX900-NEXT: v_readlane_b32 s30, v9, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v16bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; 
GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v9, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v9, s30, 0 +; GFX950-NEXT: v_writelane_b32 v9, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dwordx4 v8, v[4:7], off offset:16 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: scratch_store_dwordx4 v8, v[0:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v9, 1 +; GFX950-NEXT: v_readlane_b32 s30, v9, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v9, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v16bf16: ; GFX10: ; %bb.0: ; %entry @@ -5332,14 +5602,23 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_alloca_load_store_ret: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_alloca_load_store_ret: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_alloca_load_store_ret: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_short off, v0, s32 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: scratch_load_ushort v0, off, s32 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_alloca_load_store_ret: ; GFX10: ; %bb.0: ; %entry @@ -5625,52 +5904,72 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_overflow_stack: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen 
offset:84 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_overflow_stack: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 +; GFX900-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 +; GFX900-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 +; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 +; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 +; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 +; GFX900-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 +; 
GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; GFX900-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(25) +; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124 +; GFX900-NEXT: s_waitcnt vmcnt(25) +; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 +; GFX900-NEXT: s_waitcnt vmcnt(25) +; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:116 +; GFX900-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_overflow_stack: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:96 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:80 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:64 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off +; GFX950-NEXT: s_waitcnt vmcnt(7) +; GFX950-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:112 +; GFX950-NEXT: scratch_store_short v0, v1, off offset:128 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_overflow_stack: ; GFX10: ; %bb.0: @@ -5870,15 +6169,25 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v3bf16_to_v3f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v3bf16_to_v3f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v3bf16_to_v3f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
global_extload_v3bf16_to_v3f32: ; GFX10: ; %bb.0: @@ -6120,18 +6429,31 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v6bf16_to_v6f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx3 v[3:5], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v6bf16_to_v6f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx3 v[3:5], v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v6bf16_to_v6f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx3 v[4:6], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v6bf16_to_v6f32: ; GFX10: ; %bb.0: @@ -6766,16 +7088,27 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v2bf16_to_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v2bf16_to_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v2, v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v2bf16_to_v2f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v0, v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v2bf16_to_v2f64: ; GFX10: ; %bb.0: @@ -6852,18 +7185,31 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; 
GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v3bf16_to_v3f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v3bf16_to_v3f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v3bf16_to_v3f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v3bf16_to_v3f64: ; GFX10: ; %bb.0: @@ -8476,193 +8822,363 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v32bf16_to_v32f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:62 -; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:60 -; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:58 -; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:56 -; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:54 -; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:52 -; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:50 -; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:48 -; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:46 -; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:44 -; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:42 -; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:40 -; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:38 -; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:36 -; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:34 -; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:32 -; GFX9-NEXT: global_load_ushort v26, v[1:2], off -; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:2 -; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16 -; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18 -; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20 -; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22 -; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24 -; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:30 -; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26 -; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28 -; GFX9-NEXT: global_load_ushort v31, v[1:2], off 
offset:4 -; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6 -; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8 -; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10 -; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v12 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v14 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v15 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 -; GFX9-NEXT: s_waitcnt vmcnt(32) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v21 -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v19 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v20 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168 -; 
GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144 -; GFX9-NEXT: s_waitcnt vmcnt(44) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v2 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v2 -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v10 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 -; 
GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v32bf16_to_v32f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_ushort v9, v[1:2], off offset:62 +; GFX900-NEXT: global_load_ushort v11, v[1:2], off offset:60 +; GFX900-NEXT: global_load_ushort v12, v[1:2], off offset:58 +; GFX900-NEXT: global_load_ushort v13, v[1:2], off offset:56 +; GFX900-NEXT: global_load_ushort v14, v[1:2], off offset:54 +; GFX900-NEXT: global_load_ushort v15, v[1:2], off offset:52 +; GFX900-NEXT: global_load_ushort v16, v[1:2], off offset:50 +; GFX900-NEXT: global_load_ushort v17, v[1:2], off offset:48 +; GFX900-NEXT: global_load_ushort v18, v[1:2], off offset:46 +; GFX900-NEXT: global_load_ushort v19, v[1:2], off offset:44 +; GFX900-NEXT: global_load_ushort v20, v[1:2], off offset:42 +; GFX900-NEXT: global_load_ushort v21, v[1:2], off offset:40 +; GFX900-NEXT: global_load_ushort v22, v[1:2], off offset:38 +; GFX900-NEXT: global_load_ushort v23, v[1:2], off offset:36 +; GFX900-NEXT: global_load_ushort v24, v[1:2], off offset:34 +; GFX900-NEXT: global_load_ushort v25, v[1:2], off offset:32 +; GFX900-NEXT: global_load_ushort v26, v[1:2], off +; GFX900-NEXT: global_load_ushort v27, v[1:2], off offset:2 +; GFX900-NEXT: global_load_ushort v3, v[1:2], off offset:16 +; GFX900-NEXT: global_load_ushort v4, v[1:2], off offset:18 +; GFX900-NEXT: global_load_ushort v5, v[1:2], off offset:20 +; GFX900-NEXT: global_load_ushort v6, v[1:2], off offset:22 +; GFX900-NEXT: global_load_ushort v8, v[1:2], off offset:24 +; GFX900-NEXT: global_load_ushort v28, v[1:2], off offset:30 +; GFX900-NEXT: global_load_ushort v29, v[1:2], off offset:26 +; GFX900-NEXT: global_load_ushort v30, v[1:2], off offset:28 +; GFX900-NEXT: global_load_ushort v31, v[1:2], off offset:4 +; GFX900-NEXT: global_load_ushort v32, v[1:2], off offset:6 +; GFX900-NEXT: global_load_ushort v33, v[1:2], off offset:8 +; GFX900-NEXT: global_load_ushort v34, v[1:2], off offset:10 +; GFX900-NEXT: global_load_ushort v7, v[1:2], off offset:12 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: global_load_ushort v1, v[1:2], off offset:14 +; GFX900-NEXT: s_waitcnt vmcnt(31) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; GFX900-NEXT: s_waitcnt 
vmcnt(28) +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GFX900-NEXT: s_waitcnt vmcnt(29) +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v12 +; GFX900-NEXT: s_waitcnt vmcnt(31) +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GFX900-NEXT: s_waitcnt vmcnt(31) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v14 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v15 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 +; GFX900-NEXT: s_waitcnt vmcnt(32) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; GFX900-NEXT: s_waitcnt vmcnt(28) +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v21 +; GFX900-NEXT: s_waitcnt vmcnt(33) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v19 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v20 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: buffer_store_dword v11, 
v0, s[0:3], 0 offen offset:144 +; GFX900-NEXT: s_waitcnt vmcnt(44) +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 +; GFX900-NEXT: s_waitcnt vmcnt(38) +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GFX900-NEXT: s_waitcnt vmcnt(38) +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 +; GFX900-NEXT: s_waitcnt vmcnt(41) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v2 +; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v2 +; GFX900-NEXT: s_waitcnt vmcnt(41) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 +; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 +; GFX900-NEXT: s_waitcnt vmcnt(41) +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 +; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 +; GFX900-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v10 +; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 +; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 +; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 +; GFX900-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v32bf16_to_v32f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX950-NEXT: global_load_ushort v1, v[2:3], off offset:2 +; GFX950-NEXT: global_load_ushort v4, v[2:3], off offset:12 +; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:8 +; GFX950-NEXT: global_load_ushort v6, v[2:3], off offset:4 +; GFX950-NEXT: global_load_ushort v7, v[2:3], off +; GFX950-NEXT: global_load_ushort v8, v[2:3], off offset:6 +; GFX950-NEXT: global_load_ushort v9, v[2:3], off offset:10 +; GFX950-NEXT: global_load_ushort v10, v[2:3], off offset:14 +; GFX950-NEXT: global_load_ushort v11, v[2:3], off offset:18 +; GFX950-NEXT: global_load_ushort v12, v[2:3], off offset:28 +; GFX950-NEXT: global_load_ushort v13, v[2:3], off offset:24 +; GFX950-NEXT: global_load_ushort v14, v[2:3], off offset:20 +; GFX950-NEXT: global_load_ushort v15, v[2:3], off offset:16 +; GFX950-NEXT: global_load_ushort v16, v[2:3], off offset:22 +; GFX950-NEXT: global_load_ushort v17, v[2:3], off offset:26 +; GFX950-NEXT: global_load_ushort v18, v[2:3], off offset:30 +; GFX950-NEXT: global_load_ushort v19, v[2:3], off offset:34 +; GFX950-NEXT: global_load_ushort v20, v[2:3], off offset:44 +; GFX950-NEXT: global_load_ushort v21, v[2:3], off offset:40 +; GFX950-NEXT: global_load_ushort v22, v[2:3], off offset:36 +; GFX950-NEXT: global_load_ushort v23, v[2:3], off offset:32 +; GFX950-NEXT: global_load_ushort v24, v[2:3], off offset:38 +; GFX950-NEXT: global_load_ushort v25, v[2:3], off offset:42 +; GFX950-NEXT: global_load_ushort v26, v[2:3], off offset:46 +; GFX950-NEXT: global_load_ushort v42, v[2:3], off offset:50 +; GFX950-NEXT: global_load_ushort v43, v[2:3], off offset:62 +; GFX950-NEXT: global_load_ushort v46, v[2:3], off offset:60 +; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:56 +; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:52 +; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:48 +; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:54 +; GFX950-NEXT: 
global_load_ushort v58, v[2:3], off offset:58 +; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(31) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: s_waitcnt vmcnt(30) +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX950-NEXT: s_waitcnt vmcnt(29) +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v5 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 +; GFX950-NEXT: s_waitcnt vmcnt(27) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX950-NEXT: s_waitcnt vmcnt(26) +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v9 +; GFX950-NEXT: s_waitcnt vmcnt(24) +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; GFX950-NEXT: s_waitcnt vmcnt(23) +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; GFX950-NEXT: s_waitcnt vmcnt(22) +; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX950-NEXT: s_waitcnt vmcnt(21) +; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX950-NEXT: s_waitcnt vmcnt(20) +; GFX950-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX950-NEXT: s_waitcnt vmcnt(19) +; GFX950-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX950-NEXT: s_waitcnt vmcnt(18) +; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v27 +; GFX950-NEXT: s_waitcnt vmcnt(16) +; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v18 +; GFX950-NEXT: s_waitcnt vmcnt(15) +; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v19 +; GFX950-NEXT: s_waitcnt vmcnt(14) +; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v20 +; GFX950-NEXT: s_waitcnt vmcnt(13) +; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v21 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[14:15], v30 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v31 +; GFX950-NEXT: s_waitcnt vmcnt(10) +; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v24 +; GFX950-NEXT: s_waitcnt vmcnt(9) +; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; GFX950-NEXT: s_waitcnt vmcnt(8) +; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; GFX950-NEXT: s_waitcnt vmcnt(7) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v43 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v32 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v33 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v36 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[32:33], v37 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[30:31], v38 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[36:37], v39 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[38:39], v44 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v42 +; GFX950-NEXT: s_waitcnt vmcnt(5) +; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v46 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v42 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v58 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:240 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v46 +; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v47 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v46 +; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:224 +; 
GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v29 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v34 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v35 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[34:35], v48 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[50:51], v49 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[48:49], v52 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[54:55], v53 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[52:53], v40 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[40:41], v41 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v1 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[8:9], v7 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:208 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:192 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[38:41], off offset:176 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[52:55], off offset:160 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[48:51], off offset:144 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[34:37], off offset:128 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:112 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:96 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:80 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:64 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off +; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v32bf16_to_v32f64: ; GFX10: ; %bb.0: @@ -9050,20 +9566,29 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16: ; GFX10: ; %bb.0: @@ -9178,29 +9703,41 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v2bf16: ; GFX10: ; %bb.0: @@ -9363,38 +9900,54 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v3bf16: ; GFX10: ; %bb.0: @@ -9604,46 +10157,65 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v4bf16: -; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; 
GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v4bf16: ; GFX10: ; %bb.0: @@ -9967,80 +10539,113 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: 
v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: 
v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v8bf16: ; GFX10: ; %bb.0: @@ -10656,148 +11261,209 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: 
v_add_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 
-; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: 
v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; 
GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v16bf16: ; GFX10: ; %bb.0: @@ -12112,286 +12778,407 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; 
GFX9-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v27, v33, v27 -; 
GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v22, v33, v22 -; 
GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v17, v33, v17 -; 
GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_add_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_add_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_add_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_add_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_add_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_add_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_add_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_add_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; 
GFX900-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX900-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX900-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX900-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX900-NEXT: v_add_f32_e32 v20, 
v33, v20 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v18, v33, v18 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: 
v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; GFX950-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_add_f32_e32 v33, v34, v33 +; GFX950-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX950-NEXT: v_add_f32_e32 v30, v36, v35 +; GFX950-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX950-NEXT: v_add_f32_e32 v29, v38, v37 +; GFX950-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX950-NEXT: v_add_f32_e32 v28, v48, v39 +; GFX950-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX950-NEXT: v_add_f32_e32 v27, v50, v49 +; GFX950-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX950-NEXT: v_add_f32_e32 v26, v52, v51 +; GFX950-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX950-NEXT: v_add_f32_e32 v25, v54, v53 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31 +; GFX950-NEXT: v_add_f32_e32 v24, v32, v24 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_add_f32_e32 v23, v32, v23 +; GFX950-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: 
v_add_f32_e32 v22, v32, v22 +; GFX950-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v21, v32, v21 +; GFX950-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_add_f32_e32 v20, v32, v20 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v19, v32, v19 +; GFX950-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX950-NEXT: v_add_f32_e32 v18, v32, v18 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v15, v15, v31 +; GFX950-NEXT: v_add_f32_e32 v31, v40, v55 +; GFX950-NEXT: v_add_f32_e32 v17, v32, v17 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24 +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v32bf16: ; GFX10: ; %bb.0: @@ -13290,19 +14077,27 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_bf16_fpimm_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_bf16_fpimm_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_bf16_fpimm_0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: 
v_add_f32_e32 v0, 1.0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16_fpimm_0: ; GFX10: ; %bb.0: @@ -13386,19 +14181,27 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_bf16_fpimm_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_bf16_fpimm_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_bf16_fpimm_1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16_fpimm_1: ; GFX10: ; %bb.0: @@ -13487,20 +14290,29 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_bf16: ; GFX10: ; %bb.0: @@ -13615,29 +14427,41 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v2bf16: ; GFX10: ; %bb.0: @@ -13800,38 +14624,54 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 
16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v3bf16: ; GFX10: ; %bb.0: @@ -14041,46 +14881,65 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: 
v_sub_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v4bf16: ; GFX10: ; %bb.0: @@ -14249,20 +15108,29 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_bf16: ; GFX10: ; %bb.0: @@ -14377,29 +15245,41 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v2bf16: +; GFX950: ; %bb.0: +; 
GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v2bf16: ; GFX10: ; %bb.0: @@ -14562,38 +15442,54 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 
v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v3bf16: ; GFX10: ; %bb.0: @@ -14803,46 +15699,65 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; 
GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v4bf16: ; GFX10: ; %bb.0: @@ -15166,80 +16081,113 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: 
v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v8bf16: ; GFX10: ; %bb.0: @@ -15855,148 +16803,209 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 -; 
GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX9-NEXT: 
v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 
vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 
v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; 
GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v16bf16: ; GFX10: ; %bb.0: @@ -17311,286 +18320,407 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, 
v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 
0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 
0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_mul_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_mul_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; 
GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_mul_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_mul_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_mul_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_mul_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_mul_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_mul_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, 
v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_mul_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX900-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX900-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX900-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX900-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: 
v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX900-NEXT: v_mul_f32_e32 v20, v33, v20 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX900-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v18, v33, v18 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; GFX950-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_mul_f32_e32 v33, v34, v33 +; GFX950-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX950-NEXT: v_mul_f32_e32 v30, v36, v35 +; GFX950-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX950-NEXT: v_mul_f32_e32 v29, v38, v37 +; GFX950-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX950-NEXT: v_mul_f32_e32 v28, v48, v39 +; GFX950-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX950-NEXT: v_mul_f32_e32 v27, v50, v49 +; GFX950-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX950-NEXT: v_mul_f32_e32 v26, v52, v51 +; GFX950-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX950-NEXT: v_mul_f32_e32 v25, v54, v53 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28 +; 
GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31 +; GFX950-NEXT: v_mul_f32_e32 v24, v32, v24 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_mul_f32_e32 v23, v32, v23 +; GFX950-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_mul_f32_e32 v22, v32, v22 +; GFX950-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_mul_f32_e32 v21, v32, v21 +; GFX950-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_mul_f32_e32 v20, v32, v20 +; GFX950-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_mul_f32_e32 v19, v32, v19 +; GFX950-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX950-NEXT: v_mul_f32_e32 v18, v32, v18 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v15, v15, v31 +; GFX950-NEXT: v_mul_f32_e32 v31, v40, v55 +; GFX950-NEXT: v_mul_f32_e32 v17, v32, v17 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24 +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v32bf16: ; GFX10: ; %bb.0: @@ -18524,30 +19654,50 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_rcp_f32_e32 v4, v2 -; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX9-NEXT: 
v_fma_f32 v2, -v2, v5, v3 -; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fdiv_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX900-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_rcp_f32_e32 v4, v2 +; GFX900-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX900-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX900-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX900-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX900-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX900-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX900-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX900-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fdiv_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; GFX950-NEXT: v_rcp_f32_e32 v3, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX950-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX950-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX950-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX950-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX950-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX950-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX950-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_bf16: ; GFX10: ; %bb.0: @@ -18996,20 +20146,29 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX950-LABEL: v_minnum_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_bf16: ; GFX10: ; %bb.0: @@ -19124,29 +20283,41 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v2bf16: ; GFX10: ; %bb.0: @@ -19309,38 +20480,54 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 
0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v3bf16: ; GFX10: ; %bb.0: @@ -19550,46 +20737,65 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 
16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v4bf16: ; GFX10: ; %bb.0: @@ -19913,80 +21119,113 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, 
v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 
16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v8bf16: ; GFX10: ; %bb.0: @@ -20602,148 +21841,209 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_min_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_min_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: 
v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_min_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; 
GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 
v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v16bf16: ; GFX10: ; %bb.0: @@ -22058,286 +23358,407 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_min_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_min_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_min_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_min_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_min_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_min_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 
-; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_min_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_min_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_min_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_min_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_min_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; 
GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_min_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; 
GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_min_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_min_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_min_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_min_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; 
GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_min_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_min_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_min_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_min_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX900-NEXT: v_min_f32_e32 v24, v33, v24
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX900-NEXT: v_min_f32_e32 v23, v33, v23
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5
+; GFX900-NEXT: v_min_f32_e32 v22, v33, v22
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4
+; GFX900-NEXT: v_min_f32_e32 v21, v33, v21
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3
+; GFX900-NEXT: v_min_f32_e32 v20, v33, v20
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2
+; GFX900-NEXT: v_min_f32_e32 v19, v33, v19
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; GFX900-NEXT: v_min_f32_e32 v18, v33, v18
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GFX900-NEXT: v_min_f32_e32 v17, v33, v17
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX900-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4
+; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minnum_v32bf16:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; GFX950-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_min_f32_e32 v33, v34, v33
+; GFX950-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX950-NEXT: v_min_f32_e32 v30, v36, v35
+; GFX950-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX950-NEXT: v_min_f32_e32 v29, v38, v37
+; GFX950-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX950-NEXT: v_min_f32_e32 v28, v48, v39
+; GFX950-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX950-NEXT: v_min_f32_e32 v27, v50, v49
+; GFX950-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX950-NEXT: v_min_f32_e32 v26, v52, v51
+; GFX950-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX950-NEXT: v_min_f32_e32 v25, v54, v53
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31
+; GFX950-NEXT: v_min_f32_e32 v24, v32, v24
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_min_f32_e32 v23, v32, v23
+; GFX950-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_min_f32_e32 v22, v32, v22
+; GFX950-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_min_f32_e32 v21, v32, v21
+; GFX950-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_min_f32_e32 v20, v32, v20
+; GFX950-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_min_f32_e32 v19, v32, v19
+; GFX950-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX950-NEXT: v_min_f32_e32 v18, v32, v18
+; GFX950-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_min_f32_e32 v15, v15, v31
+; GFX950-NEXT: v_min_f32_e32 v31, v40, v55
+; GFX950-NEXT: v_min_f32_e32 v17, v32, v17
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minnum_v32bf16:
 ; GFX10: ; %bb.0:
@@ -23250,20 +24671,29 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maxnum_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_bf16:
 ; GFX10: ; %bb.0:
@@ -23378,29 +24808,41 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maxnum_v2bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_v2bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v2, v3, v2
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_v2bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v2, v3, v2
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v2bf16:
 ; GFX10: ; %bb.0:
@@ -23563,38 +25005,54 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maxnum_v3bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_v3bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v3, v4, v3
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_v3bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v3, v4, v3
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX950-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maxnum_v3bf16:
 ; GFX10: ; %bb.0:
@@ -23804,46 +25262,65 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maxnum_v4bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v4bf16: ; GFX10: ; %bb.0: @@ -24167,80 +25644,113 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: 
v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX950-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v8bf16: ; GFX10: ; %bb.0: @@ -24856,148 +26366,209 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_max_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_max_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_max_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 
-; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: 
v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; 
GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: 
v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v16bf16: ; GFX10: ; %bb.0: @@ -26312,286 +27883,407 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_max_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 
-; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
-; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
-; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
-; GFX9-NEXT: v_max_f32_e32 v32, v32, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
-; GFX9-NEXT: v_max_f32_e32 v33, v33, v34
-; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX9-NEXT: v_max_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
-; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
-; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
-; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
-; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
-; GFX9-NEXT: v_max_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: v_max_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
-; GFX9-NEXT: v_max_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; GFX9-NEXT: v_max_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
-; GFX9-NEXT: v_max_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
-; GFX9-NEXT: v_max_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
-; GFX9-NEXT: v_max_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
-; GFX9-NEXT: v_max_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: v_max_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
-; GFX9-NEXT: v_max_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
-; GFX9-NEXT: v_max_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
-; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
-; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4
-; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4
-; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
-; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4
-; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4
-; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4
-; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4
-; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4
-; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4
-; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4
-; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4
-; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4
-; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4
-; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
-; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maxnum_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14
+; GFX900-NEXT: v_max_f32_e32 v31, v32, v31
+; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX900-NEXT: v_max_f32_e32 v30, v32, v30
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
+; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX900-NEXT: v_max_f32_e32 v32, v32, v29
+; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29
+; GFX900-NEXT: v_max_f32_e32 v33, v33, v34
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_max_f32_e32 v29, v15, v29
+; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1
+; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
+; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1
+; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11
+; GFX900-NEXT: v_max_f32_e32 v28, v33, v28
+; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10
+; GFX900-NEXT: v_max_f32_e32 v27, v33, v27
+; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9
+; GFX900-NEXT: v_max_f32_e32 v26, v33, v26
+; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8
+; GFX900-NEXT: v_max_f32_e32 v25, v33, v25
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GFX900-NEXT: v_max_f32_e32 v24, v33, v24
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6
+; GFX900-NEXT: v_max_f32_e32 v23, v33, v23
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5
+; GFX900-NEXT: v_max_f32_e32 v22, v33, v22
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4
+; GFX900-NEXT: v_max_f32_e32 v21, v33, v21
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3
+; GFX900-NEXT: v_max_f32_e32 v20, v33, v20
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2
+; GFX900-NEXT: v_max_f32_e32 v19, v33, v19
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1
+; GFX900-NEXT: v_max_f32_e32 v18, v33, v18
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v17, v33, v17
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4
+; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
+; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4
+; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maxnum_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v31, off, s32
+; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; GFX950-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_max_f32_e32 v33, v34, v33
+; GFX950-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX950-NEXT: v_max_f32_e32 v30, v36, v35
+; GFX950-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX950-NEXT: v_max_f32_e32 v29, v38, v37
+; GFX950-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX950-NEXT: v_max_f32_e32 v28, v48, v39
+; GFX950-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX950-NEXT: v_max_f32_e32 v27, v50, v49
+; GFX950-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX950-NEXT: v_max_f32_e32 v26, v52, v51
+; GFX950-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX950-NEXT: v_max_f32_e32 v25, v54, v53
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31
+; GFX950-NEXT: v_max_f32_e32 v24, v32, v24
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_max_f32_e32 v23, v32, v23
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_max_f32_e32 v22, v32, v22
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_max_f32_e32 v21, v32, v21
+; GFX950-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_max_f32_e32 v20, v32, v20
+; GFX950-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_max_f32_e32 v19, v32, v19
+; GFX950-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX950-NEXT: v_max_f32_e32 v18, v32, v18
+; GFX950-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v15, v15, v31
+; GFX950-NEXT: v_max_f32_e32 v31, v40, v55
+; GFX950-NEXT: v_max_f32_e32 v17, v32, v17
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_maxnum_v32bf16:
; GFX10: ; %bb.0:
@@ -27543,36 +29235,66 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_sqrt_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xf800000
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_sqrt_f32_e32 v1, v0
-; GFX9-NEXT: v_add_u32_e32 v2, -1, v1
-; GFX9-NEXT: v_fma_f32 v3, -v2, v1, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; GFX9-NEXT: v_add_u32_e32 v3, 1, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; GFX9-NEXT: v_fma_f32 v1, -v3, v1, v0
-; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; GFX9-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x260
-; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_sqrt_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_mov_b32 s4, 0xf800000
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-NEXT: v_sqrt_f32_e32 v1, v0
+; GFX900-NEXT: v_add_u32_e32 v2, -1, v1
+; GFX900-NEXT: v_fma_f32 v3, -v2, v1, v0
+; GFX900-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
+; GFX900-NEXT: v_add_u32_e32 v3, 1, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; GFX900-NEXT: v_fma_f32 v1, -v3, v1, v0
+; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; GFX900-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX900-NEXT: v_mov_b32_e32 v2, 0x260
+; GFX900-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_sqrt_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: s_mov_b32 s0, 0xf800000
+; GFX950-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX950-NEXT: v_sqrt_f32_e32 v1, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_add_u32_e32 v2, -1, v1
+; GFX950-NEXT: v_fma_f32 v3, -v2, v1, v0
+; GFX950-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3
+; GFX950-NEXT: v_add_u32_e32 v3, 1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
+; GFX950-NEXT: v_fma_f32 v1, -v3, v1, v0
+; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1]
+; GFX950-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX950-NEXT: v_mov_b32_e32 v2, 0x260
+; GFX950-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sqrt_bf16:
; GFX10: ; %bb.0:
@@ -27715,19 +29437,27 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_ldexp_bf16_i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_ldexp_bf16_i32:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_ldexp_bf16_i32:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ldexp_bf16_i32:
; GFX10: ; %bb.0:
@@ -27820,20 +29550,29 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_frexp_bf16_i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_frexp_bf16_i16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX900-NEXT: v_frexp_mant_f32_e32 v0, v1
+; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_frexp_bf16_i16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX950-NEXT: v_frexp_mant_f32_e32 v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_frexp_bf16_i16:
; GFX10: ; %bb.0:
@@ -27979,35 +29718,61 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_log_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x800000
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_log_f32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x3f317217
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1
-; GFX9-NEXT: s_mov_b32 s4, 0x3377d1cf
-; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7f800000
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x800000
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-NEXT: v_log_f32_e32 v0, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x3f317217
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX900-NEXT: v_fma_f32 v2, v0, s4, -v1
+; GFX900-NEXT: s_mov_b32 s4, 0x3377d1cf
+; GFX900-NEXT: v_fma_f32 v2, v0, s4, v2
+; GFX900-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX900-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_log_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x800000
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x3f317217
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX950-NEXT: v_log_f32_e32 v0, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX950-NEXT: v_fma_f32 v2, v0, s0, -v1
+; GFX950-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x7f800000
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v1, 0x41b17218
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log_bf16:
; GFX10: ; %bb.0:
@@ -28153,26 +29918,42 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_log2_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x800000
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: v_log_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x800000
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX900-NEXT: v_log_f32_e32 v0, v0
+; GFX900-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_log2_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x800000
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: v_mov_b32_e32 v1, 0x42000000
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v2
+; GFX950-NEXT: v_log_f32_e32 v0, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log2_bf16:
; GFX10: ; %bb.0:
@@ -28329,35 +30110,61 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_log10_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x800000
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_log_f32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1
-; GFX9-NEXT: s_mov_b32 s4, 0x3284fbcf
-; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x7f800000
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log10_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x800000
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-NEXT: v_log_f32_e32 v0, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x3e9a209a
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX900-NEXT: v_fma_f32 v2, v0, s4, -v1
+; GFX900-NEXT: s_mov_b32 s4, 0x3284fbcf
+; GFX900-NEXT: v_fma_f32 v2, v0, s4, v2
+; GFX900-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX900-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX900-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_log10_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x800000
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x3e9a209a
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX950-NEXT: v_log_f32_e32 v0, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX950-NEXT: v_fma_f32 v2, v0, s0, -v1
+; GFX950-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX950-NEXT: s_mov_b32 s0, 0x7f800000
+; GFX950-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v1, 0x411a209b
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_log10_bf16:
; GFX10: ; %bb.0:
@@ -28541,36 +30348,61 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_exp_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x3fb8aa3b
-; GFX9-NEXT: v_rndne_f32_e32 v2, v1
-; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2
-; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1
-; GFX9-NEXT: s_mov_b32 s4, 0x32a5705f
-; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1
-; GFX9-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT: s_mov_b32 s4, 0xc2ce8ed0
-; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x42b17218
-; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_exp_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x3fb8aa3b
+; GFX900-NEXT: v_rndne_f32_e32 v2, v1
+; GFX900-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX900-NEXT: v_fma_f32 v1, v0, s4, -v1
+; GFX900-NEXT: s_mov_b32 s4, 0x32a5705f
+; GFX900-NEXT: v_fma_f32 v1, v0, s4, v1
+; GFX900-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX900-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX900-NEXT: s_mov_b32 s4, 0xc2ce8ed0
+; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x42b17218
+; GFX900-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_exp_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x3fb8aa3b
+; GFX950-NEXT: v_rndne_f32_e32 v2, v1
+; GFX950-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX950-NEXT: v_fma_f32 v1, v0, s0, -v1
+; GFX950-NEXT: v_fmamk_f32 v1, v0, 0x32a5705f, v1
+; GFX950-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX950-NEXT: v_exp_f32_e32 v1, v1
+; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX950-NEXT: s_mov_b32 s0, 0xc2ce8ed0
+; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x42b17218
+; GFX950-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp_bf16:
; GFX10: ; %bb.0:
@@ -28722,27 +30554,43 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_exp2_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000
-; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_not_b32_e32 v1, 63
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_exp2_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_mov_b32 s4, 0xc2fc0000
+; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX900-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-NEXT: v_not_b32_e32 v1, 63
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_exp2_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: s_mov_b32 s0, 0xc2fc0000
+; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: v_mov_b32_e32 v2, 0x42800000
+; GFX950-NEXT: v_not_b32_e32 v1, 63
+; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX950-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX950-NEXT: v_exp_f32_e32 v0, v0
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_ldexp_f32 v0, v0, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp2_bf16:
; GFX10: ; %bb.0:
@@ -28900,36 +30748,61 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_exp10_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x40549a78
-; GFX9-NEXT: v_rndne_f32_e32 v2, v1
-; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2
-; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1
-; GFX9-NEXT: s_mov_b32 s4, 0x33979a37
-; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1
-; GFX9-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX9-NEXT: v_exp_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT: s_mov_b32 s4, 0xc23369f4
-; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x421a209b
-; GFX9-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000
-; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_exp10_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x40549a78
+; GFX900-NEXT: v_rndne_f32_e32 v2, v1
+; GFX900-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX900-NEXT: v_fma_f32 v1, v0, s4, -v1
+; GFX900-NEXT: s_mov_b32 s4, 0x33979a37
+; GFX900-NEXT: v_fma_f32 v1, v0, s4, v1
+; GFX900-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX900-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX900-NEXT: s_mov_b32 s4, 0xc23369f4
+; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: s_mov_b32 s4, 0x421a209b
+; GFX900-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_exp10_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x40549a78
+; GFX950-NEXT: v_rndne_f32_e32 v2, v1
+; GFX950-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX950-NEXT: v_fma_f32 v1, v0, s0, -v1
+; GFX950-NEXT: v_fmamk_f32 v1, v0, 0x33979a37, v1
+; GFX950-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX950-NEXT: v_exp_f32_e32 v1, v1
+; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2
+; GFX950-NEXT: s_mov_b32 s0, 0xc23369f4
+; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_mov_b32 s0, 0x421a209b
+; GFX950-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v0
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_exp10_bf16:
; GFX10: ; %bb.0:
@@ -29059,19 +30932,27 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_ceil_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_ceil_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_ceil_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_ceil_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_ceil_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_ceil_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ceil_bf16:
; GFX10: ; %bb.0:
@@ -29157,19 +31038,27 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_trunc_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_trunc_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_trunc_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_trunc_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_trunc_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_trunc_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_trunc_bf16:
; GFX10: ; %bb.0:
@@ -29255,19 +31144,27 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_rint_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_rndne_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_rint_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_rndne_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_rint_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_rndne_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rint_bf16:
; GFX10: ; %bb.0:
@@ -29353,19 +31250,27 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_nearbyint_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_rndne_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_nearbyint_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_rndne_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_nearbyint_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_rndne_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_nearbyint_bf16:
; GFX10: ; %bb.0:
@@ -29469,25 +31374,40 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_round_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_trunc_f32_e32 v1, v0
-; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
-; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_round_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_trunc_f32_e32 v1, v0
+; GFX900-NEXT: v_sub_f32_e32 v2, v0, v1
+; GFX900-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX900-NEXT: s_brev_b32 s4, -2
+; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0
+; GFX900-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_round_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_trunc_f32_e32 v1, v0
+; GFX950-NEXT: v_sub_f32_e32 v2, v0, v1
+; GFX950-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX950-NEXT: s_brev_b32 s0, -2
+; GFX950-NEXT: v_bfi_b32 v0, s0, v2, v0
+; GFX950-NEXT: v_add_f32_e32 v0, v1, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_round_bf16:
; GFX10: ; %bb.0:
@@ -29592,19 +31512,27 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_roundeven_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_rndne_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_roundeven_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_rndne_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_roundeven_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_rndne_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_bf16:
; GFX10: ; %bb.0:
@@ -29690,19 +31618,27 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_floor_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_floor_f32_e32 v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_floor_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_floor_f32_e32 v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_floor_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_floor_f32_e32 v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_floor_bf16:
; GFX10: ; %bb.0:
@@ -29786,19 +31722,27 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_canonicalize_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
-; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_canonicalize_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_canonicalize_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_canonicalize_bf16:
; GFX10: ; %bb.0:
@@ -29929,14 +31873,24 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_oeq_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_fcmp_oeq_bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fcmp_oeq_bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fcmp_oeq_bf16:
; GFX10: ; %bb.0:
@@ -30004,14 +31958,24 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fcmp_ogt_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-;
GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ogt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ogt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ogt_bf16: ; GFX10: ; %bb.0: @@ -30079,14 +32043,24 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_oge_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_oge_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_oge_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_oge_bf16: ; GFX10: ; %bb.0: @@ -30154,14 +32128,24 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_olt_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_olt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_olt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_olt_bf16: ; GFX10: ; %bb.0: @@ -30229,14 +32213,24 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ole_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ole_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ole_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ole_bf16: ; GFX10: ; %bb.0: @@ -30304,14 +32298,24 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_one_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_one_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_one_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_one_bf16: ; GFX10: ; %bb.0: @@ -30379,14 +32383,24 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_uno_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_uno_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_uno_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_uno_bf16: ; GFX10: ; %bb.0: @@ -30454,14 +32468,24 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; -; GFX9-LABEL: v_fcmp_ueq_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ueq_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ueq_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ueq_bf16: ; GFX10: ; %bb.0: @@ -30529,14 +32553,24 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ugt_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ugt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ugt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ugt_bf16: ; GFX10: ; %bb.0: @@ -30604,14 +32638,24 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_uge_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_uge_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_uge_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_uge_bf16: ; GFX10: ; %bb.0: @@ -30679,14 +32723,24 @@ define i1 @v_fcmp_ult_bf16(bfloat 
%a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ult_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ult_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ult_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ult_bf16: ; GFX10: ; %bb.0: @@ -30754,14 +32808,24 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ule_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ule_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ule_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ule_bf16: ; GFX10: ; %bb.0: @@ -30829,14 +32893,24 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_une_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_une_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_une_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
v_fcmp_une_bf16: ; GFX10: ; %bb.0: @@ -31011,16 +33085,27 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v2bf16_to_v2i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v2bf16_to_v2i16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v1, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16: ; GFX10: ; %bb.0: @@ -31110,18 +33195,31 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v3bf16_to_v3i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v3bf16_to_v3i16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v2, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16: ; GFX10: ; %bb.0: @@ -31232,21 +33330,37 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX9-LABEL: v_fptosi_v4bf16_to_v4i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v4bf16_to_v4i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX900-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v4bf16_to_v4i16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX950-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX950-NEXT: v_perm_b32 v1, v1, v2, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX10: ; %bb.0: @@ -31663,24 +33777,44 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_bf16_to_i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v1, |v0|, s4 -; GFX9-NEXT: v_floor_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0xcf800000 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GFX9-NEXT: v_fma_f32 v1, v1, s4, |v0| -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v3 -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_bf16_to_i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: v_mul_f32_e64 v1, |v0|, s4 +; GFX900-NEXT: v_floor_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0xcf800000 +; GFX900-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX900-NEXT: v_fma_f32 v1, v1, s4, |v0| +; GFX900-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX900-NEXT: v_xor_b32_e32 v2, v2, v3 +; GFX900-NEXT: v_xor_b32_e32 v0, v1, v3 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 +; GFX900-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc +; GFX900-NEXT: s_setpc_b64 
s[30:31] +; +; GFX950-LABEL: v_fptosi_bf16_to_i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v1, |v0|, s0 +; GFX950-NEXT: v_floor_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0xcf800000 +; GFX950-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX950-NEXT: v_fma_f32 v1, v1, s0, |v0| +; GFX950-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX950-NEXT: v_xor_b32_e32 v2, v2, v3 +; GFX950-NEXT: v_xor_b32_e32 v0, v1, v3 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_bf16_to_i64: ; GFX10: ; %bb.0: @@ -31845,36 +33979,69 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v2bf16_to_v2i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v2, |v1|, s4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_floor_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX9-NEXT: v_trunc_f32_e32 v4, v0 -; GFX9-NEXT: v_fma_f32 v3, v2, s5, |v1| -; GFX9-NEXT: v_mul_f32_e64 v0, |v4|, s4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_fma_f32 v5, v0, s5, |v4| -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, v5, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, v6, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v2bf16_to_v2i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: v_mul_f32_e64 v2, |v1|, s4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_floor_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s5, 0xcf800000 +; GFX900-NEXT: v_trunc_f32_e32 v4, v0 +; GFX900-NEXT: v_fma_f32 v3, v2, s5, |v1| +; GFX900-NEXT: v_mul_f32_e64 v0, |v4|, s4 +; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX900-NEXT: v_fma_f32 v5, v0, s5, |v4| +; GFX900-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX900-NEXT: v_xor_b32_e32 v3, v3, v1 +; GFX900-NEXT: v_xor_b32_e32 v2, v2, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX900-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX900-NEXT: v_xor_b32_e32 v2, v5, v3 +; GFX900-NEXT: v_xor_b32_e32 v4, v6, v3 +; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
v_fptosi_v2bf16_to_v2i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v2, |v1|, s0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_floor_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s1, 0xcf800000 +; GFX950-NEXT: v_trunc_f32_e32 v4, v0 +; GFX950-NEXT: v_fma_f32 v3, v2, s1, |v1| +; GFX950-NEXT: v_mul_f32_e64 v0, |v4|, s0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX950-NEXT: v_fma_f32 v5, v0, s1, |v4| +; GFX950-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v3, v1 +; GFX950-NEXT: v_xor_b32_e32 v2, v2, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX950-NEXT: v_xor_b32_e32 v2, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v3 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64: ; GFX10: ; %bb.0: @@ -32082,49 +34249,96 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4 -; GFX9-NEXT: v_floor_f32_e32 v3, v3 -; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2| -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_trunc_f32_e32 v5, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5| -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX9-NEXT: v_mul_f32_e64 v5, |v1|, s4 -; GFX9-NEXT: v_floor_f32_e32 v5, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, v7, v3 -; GFX9-NEXT: v_fma_f32 v7, v5, s5, |v1| -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, v8, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v4, v7, v1 -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v3bf16_to_v3i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: 
v_mul_f32_e64 v3, |v2|, s4 +; GFX900-NEXT: v_floor_f32_e32 v3, v3 +; GFX900-NEXT: s_mov_b32 s5, 0xcf800000 +; GFX900-NEXT: v_fma_f32 v4, v3, s5, |v2| +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX900-NEXT: v_trunc_f32_e32 v5, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX900-NEXT: v_mul_f32_e64 v0, |v5|, s4 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX900-NEXT: v_fma_f32 v6, v0, s5, |v5| +; GFX900-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v8, v0 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX900-NEXT: v_trunc_f32_e32 v1, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX900-NEXT: v_mul_f32_e64 v5, |v1|, s4 +; GFX900-NEXT: v_floor_f32_e32 v5, v5 +; GFX900-NEXT: v_xor_b32_e32 v2, v7, v3 +; GFX900-NEXT: v_fma_f32 v7, v5, s5, |v1| +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX900-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX900-NEXT: v_xor_b32_e32 v4, v8, v3 +; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX900-NEXT: v_xor_b32_e32 v4, v7, v1 +; GFX900-NEXT: v_xor_b32_e32 v5, v5, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v3bf16_to_v3i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v3, |v2|, s0 +; GFX950-NEXT: v_floor_f32_e32 v3, v3 +; GFX950-NEXT: s_mov_b32 s1, 0xcf800000 +; GFX950-NEXT: v_fma_f32 v4, v3, s1, |v2| +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX950-NEXT: v_trunc_f32_e32 v5, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX950-NEXT: v_mul_f32_e64 v0, |v5|, s0 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX950-NEXT: v_fma_f32 v6, v0, s1, |v5| +; GFX950-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v8, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX950-NEXT: v_trunc_f32_e32 v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX950-NEXT: v_mul_f32_e64 v5, |v1|, s0 +; GFX950-NEXT: v_floor_f32_e32 v5, v5 +; GFX950-NEXT: v_xor_b32_e32 v2, v7, v3 +; GFX950-NEXT: v_fma_f32 v7, v5, s1, |v1| +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX950-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX950-NEXT: v_xor_b32_e32 v4, v8, v3 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX950-NEXT: v_xor_b32_e32 v4, v7, v1 +; GFX950-NEXT: v_xor_b32_e32 v5, v5, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64: ; 
GFX10: ; %bb.0: @@ -32393,61 +34607,120 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4 -; GFX9-NEXT: v_floor_f32_e32 v3, v3 -; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2| -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_trunc_f32_e32 v5, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5| -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, v6, v3 -; GFX9-NEXT: v_mul_f32_e64 v6, |v5|, s4 -; GFX9-NEXT: v_floor_f32_e32 v6, v6 -; GFX9-NEXT: v_xor_b32_e32 v4, v7, v3 -; GFX9-NEXT: v_fma_f32 v7, v6, s5, |v5| -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v4, v7, v5 -; GFX9-NEXT: v_mul_f32_e64 v7, |v1|, s4 -; GFX9-NEXT: v_floor_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_fma_f32 v9, v7, s5, |v1| -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, v6, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, v9, v1 -; GFX9-NEXT: v_xor_b32_e32 v7, v7, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v8 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v4bf16_to_v4i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: v_mul_f32_e64 v3, |v2|, s4 +; GFX900-NEXT: v_floor_f32_e32 v3, v3 +; GFX900-NEXT: s_mov_b32 s5, 0xcf800000 +; GFX900-NEXT: v_fma_f32 v4, v3, s5, |v2| +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX900-NEXT: v_trunc_f32_e32 v5, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX900-NEXT: v_mul_f32_e64 v0, |v5|, s4 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX900-NEXT: v_fma_f32 v6, v0, s5, |v5| +; GFX900-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX900-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v0 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX900-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; 
GFX900-NEXT: v_trunc_f32_e32 v5, v5 +; GFX900-NEXT: v_xor_b32_e32 v2, v6, v3 +; GFX900-NEXT: v_mul_f32_e64 v6, |v5|, s4 +; GFX900-NEXT: v_floor_f32_e32 v6, v6 +; GFX900-NEXT: v_xor_b32_e32 v4, v7, v3 +; GFX900-NEXT: v_fma_f32 v7, v6, s5, |v5| +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX900-NEXT: v_trunc_f32_e32 v1, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX900-NEXT: v_xor_b32_e32 v4, v7, v5 +; GFX900-NEXT: v_mul_f32_e64 v7, |v1|, s4 +; GFX900-NEXT: v_floor_f32_e32 v7, v7 +; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX900-NEXT: v_fma_f32 v9, v7, s5, |v1| +; GFX900-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX900-NEXT: v_xor_b32_e32 v6, v6, v5 +; GFX900-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX900-NEXT: v_xor_b32_e32 v6, v9, v1 +; GFX900-NEXT: v_xor_b32_e32 v7, v7, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, v8 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v4bf16_to_v4i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v3, |v2|, s0 +; GFX950-NEXT: v_floor_f32_e32 v3, v3 +; GFX950-NEXT: s_mov_b32 s1, 0xcf800000 +; GFX950-NEXT: v_fma_f32 v4, v3, s1, |v2| +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX950-NEXT: v_trunc_f32_e32 v5, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX950-NEXT: v_mul_f32_e64 v0, |v5|, s0 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX950-NEXT: v_fma_f32 v6, v0, s1, |v5| +; GFX950-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX950-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_trunc_f32_e32 v5, v5 +; GFX950-NEXT: v_xor_b32_e32 v2, v6, v3 +; GFX950-NEXT: v_mul_f32_e64 v6, |v5|, s0 +; GFX950-NEXT: v_floor_f32_e32 v6, v6 +; GFX950-NEXT: v_xor_b32_e32 v4, v7, v3 +; GFX950-NEXT: v_fma_f32 v7, v6, s1, |v5| +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX950-NEXT: v_trunc_f32_e32 v1, v1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX950-NEXT: v_xor_b32_e32 v4, v7, v5 +; GFX950-NEXT: v_mul_f32_e64 v7, |v1|, s0 +; GFX950-NEXT: v_floor_f32_e32 v7, v7 +; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX950-NEXT: v_fma_f32 v9, v7, s1, |v1| +; GFX950-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX950-NEXT: v_xor_b32_e32 v6, v6, v5 +; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX950-NEXT: v_xor_b32_e32 v6, v9, v1 +; GFX950-NEXT: v_xor_b32_e32 v7, v7, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1 
+; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64: ; GFX10: ; %bb.0: @@ -32594,18 +34867,25 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_i16_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_i16_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_i16_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i16_to_bf16: ; GFX10: ; %bb.0: @@ -32698,25 +34978,33 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v2i16_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 
0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v2i16_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16: ; GFX10: ; %bb.0: @@ -32846,32 +35134,42 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v3i16_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v3i16_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v3i16_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16: ; GFX10: ; %bb.0: @@ -33042,38 +35340,49 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v4i16_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; 
GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v4i16_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: @@ -33219,18 +35528,25 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_i32_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_i32_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_i32_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i32_to_bf16: ; GFX10: ; %bb.0: @@ -33315,25 +35631,33 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v2i32_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; 
GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v2i32_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16: ; GFX10: ; %bb.0: @@ -33452,32 +35776,42 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v3i32_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v3i32_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16: ; GFX10: ; %bb.0: @@ -33629,38 +35963,49 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v4i32_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v4i32_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX950-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX10: ; %bb.0: @@ -33827,29 +36172,47 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX9-LABEL: v_sitofp_i64_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, v0, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 -; GFX9-NEXT: v_add_u32_e32 v3, -1, v3 -; GFX9-NEXT: v_min_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_i64_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v2, v0, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX900-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX900-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX900-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX900-NEXT: v_min_u32_e32 v2, v3, v2 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_i64_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_min_u32_e32 v2, v3, v2 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i64_to_bf16: ; GFX10: ; %bb.0: @@ -34044,47 +36407,77 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX9-NEXT: v_ffbh_i32_e32 v4, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 -; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v4, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v6, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v2i64_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX900-NEXT: v_ffbh_i32_e32 v4, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX900-NEXT: v_add_u32_e32 v4, -1, v4 +; GFX900-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX900-NEXT: v_min_u32_e32 v4, v4, v5 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX900-NEXT: v_ldexp_f32 v4, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX900-NEXT: v_add3_u32 v5, v0, v4, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX900-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX900-NEXT: v_min_u32_e32 v6, v0, v1 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v6 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v2i64_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v5, v2, v3 +; GFX950-NEXT: v_ffbh_i32_e32 v4, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX950-NEXT: v_add_u32_e32 v4, -1, v4 +; GFX950-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX950-NEXT: v_min_u32_e32 v4, v4, v5 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX950-NEXT: v_min_u32_e32 v3, v3, v5 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: 
v_sub_u32_e32 v1, 32, v4 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16: ; GFX10: ; %bb.0: @@ -34386,65 +36779,109 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v7, v4, v5 -; GFX9-NEXT: v_ffbh_i32_e32 v6, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 -; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 -; GFX9-NEXT: v_ffbh_i32_e32 v6, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 -; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc -; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v3i64_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v7, v4, v5 +; GFX900-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX900-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX900-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX900-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX900-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX900-NEXT: v_ffbh_i32_e32 v6, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX900-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX900-NEXT: v_add_u32_e32 v7, 32, 
v7 +; GFX900-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX900-NEXT: v_ldexp_f32 v5, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v5, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX900-NEXT: v_add3_u32 v6, v0, v5, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX900-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX900-NEXT: v_min_u32_e32 v7, v0, v1 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v4, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v3i64_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v7, v4, v5 +; GFX950-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX950-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX950-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX950-NEXT: v_xor_b32_e32 v6, v2, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v6, 31, v6 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX950-NEXT: v_ffbh_i32_e32 v5, v3 +; GFX950-NEXT: v_add_u32_e32 v5, -1, v5 +; GFX950-NEXT: v_add_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_min_u32_e32 v5, v5, v6 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_xor_b32_e32 v6, v0, v1 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v6, 31, v6 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_add_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_min_u32_e32 v3, v3, v6 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v5 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
v_sitofp_v3i64_to_v3bf16: ; GFX10: ; %bb.0: @@ -34842,82 +37279,137 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v9, v4, v5 -; GFX9-NEXT: v_ffbh_i32_e32 v8, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9 -; GFX9-NEXT: v_add_u32_e32 v8, -1, v8 -; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 -; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v8, v4, v5 -; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7 -; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v4, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 -; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v10, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1 -; GFX9-NEXT: v_ffbh_i32_e32 v7, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v7 -; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 -; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc -; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v4i64_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX900-NEXT: v_ffbh_i32_e32 v8, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX900-NEXT: v_add_u32_e32 v8, -1, v8 +; 
GFX900-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX900-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v8, v4, v5 +; GFX900-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v5, v6, v7 +; GFX900-NEXT: v_add3_u32 v9, v4, v8, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v4, v7 +; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX900-NEXT: v_add_u32_e32 v4, -1, v4 +; GFX900-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX900-NEXT: v_min_u32_e32 v10, v4, v5 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_xor_b32_e32 v8, v0, v1 +; GFX900-NEXT: v_ffbh_i32_e32 v7, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX900-NEXT: v_add_u32_e32 v7, -1, v7 +; GFX900-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_min_u32_e32 v7, v7, v8 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX900-NEXT: v_sub_u32_e32 v6, 32, v10 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX900-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc +; GFX900-NEXT: v_ldexp_f32 v6, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX900-NEXT: v_add3_u32 v7, v0, v6, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX900-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX900-NEXT: v_min_u32_e32 v8, v0, v1 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v4, v5, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v4i64_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v9, v6, v7 +; GFX950-NEXT: v_ffbh_i32_e32 v8, v7 +; GFX950-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX950-NEXT: v_add_u32_e32 v8, -1, v8 +; GFX950-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX950-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX950-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7] +; GFX950-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX950-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX950-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX950-NEXT: v_ffbh_i32_e32 v7, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX950-NEXT: 
v_add_u32_e32 v7, -1, v7 +; GFX950-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX950-NEXT: v_min_u32_e32 v7, v7, v9 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_cvt_f32_i32_e32 v6, v6 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX950-NEXT: v_ldexp_f32 v5, v6, v5 +; GFX950-NEXT: v_sub_u32_e32 v6, 32, v7 +; GFX950-NEXT: v_xor_b32_e32 v7, v2, v3 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX950-NEXT: v_ffbh_i32_e32 v6, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX950-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX950-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_min_u32_e32 v3, v3, v7 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v4, v5 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX10: ; %bb.0: @@ -35202,18 +37694,25 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_i16_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_i16_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_i16_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i16_to_bf16: ; GFX10: ; %bb.0: @@ -35306,25 +37805,33 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa 
v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v2i16_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v2i16_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16: ; GFX10: ; %bb.0: @@ -35457,32 +37964,42 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v3i16_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v3i16_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16: ; GFX10: ; %bb.0: @@ -35656,38 +38173,49 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v4i16_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v4i16_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: @@ -35838,18 +38366,25 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_i32_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_i32_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_i32_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; 
; GFX10-LABEL: v_uitofp_i32_to_bf16: ; GFX10: ; %bb.0: @@ -35934,25 +38469,33 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v2i32_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v2i32_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16: ; GFX10: ; %bb.0: @@ -36071,32 +38614,42 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v3i32_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v3i32_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_cvt_f32_u32_e32 v3, v1 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16: ; GFX10: ; %bb.0: @@ -36248,38 +38801,49 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v4i32_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: 
v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v4i32_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX950-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX10: ; %bb.0: @@ -36434,25 +38998,39 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_i64_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX9-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_i64_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX900-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_i64_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX950-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i64_to_bf16: ; GFX10: ; %bb.0: @@ -36606,39 +39184,61 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX9-LABEL: v_uitofp_v2i64_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v4, v1 -; GFX9-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v4, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v2i64_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v4, v1 +; GFX900-NEXT: v_min_u32_e32 v4, 32, v4 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX900-NEXT: v_ldexp_f32 v4, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v5, v0, v4, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX900-NEXT: v_min_u32_e32 v6, 32, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v6 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v2i64_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_ffbh_u32_e32 v4, v3 +; GFX950-NEXT: v_min_u32_e32 v4, 32, v4 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX950-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: v_uitofp_v2i64_to_v2bf16: ; GFX10: ; %bb.0: @@ -36874,53 +39474,85 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v6, v5 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 -; GFX9-NEXT: v_ffbh_u32_e32 v6, v1 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc -; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 -; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v3i64_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v6, v5 +; GFX900-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX900-NEXT: v_ffbh_u32_e32 v6, v1 +; GFX900-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX900-NEXT: v_ldexp_f32 v5, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v6, v0, v5, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX900-NEXT: v_min_u32_e32 v7, 32, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 
vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v4, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v3i64_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_ffbh_u32_e32 v6, v5 +; GFX950-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX950-NEXT: v_ffbh_u32_e32 v5, v3 +; GFX950-NEXT: v_min_u32_e32 v5, 32, v5 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX950-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v5 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16: ; GFX10: ; %bb.0: @@ -37236,66 +39868,105 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v8, v5 -; GFX9-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v8, v4, v5 -; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v4, v7 -; GFX9-NEXT: v_min_u32_e32 v10, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX9-NEXT: v_ffbh_u32_e32 v7, v1 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 -; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc -; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 -; 
GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v8, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v4i64_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v8, v5 +; GFX900-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v8, v4, v5 +; GFX900-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX900-NEXT: v_add3_u32 v9, v4, v8, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v4, v7 +; GFX900-NEXT: v_min_u32_e32 v10, 32, v4 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX900-NEXT: v_ffbh_u32_e32 v7, v1 +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX900-NEXT: v_sub_u32_e32 v6, 32, v10 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX900-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc +; GFX900-NEXT: v_ldexp_f32 v6, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX900-NEXT: v_add3_u32 v7, v0, v6, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX900-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v4, v5, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v4i64_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
v_ffbh_u32_e32 v8, v7 +; GFX950-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX950-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7] +; GFX950-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX950-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX950-NEXT: v_ffbh_u32_e32 v7, v5 +; GFX950-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_cvt_f32_u32_e32 v6, v6 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX950-NEXT: v_ldexp_f32 v5, v6, v5 +; GFX950-NEXT: v_sub_u32_e32 v6, 32, v7 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX950-NEXT: v_ffbh_u32_e32 v6, v3 +; GFX950-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX950-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v4, v5 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX10: ; %bb.0: @@ -37531,13 +40202,22 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_bf16: ; GFX10: ; %bb.0: @@ -37600,14 +40280,24 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_fneg_lhs_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_fneg_lhs_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_fneg_lhs_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; 
GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_fneg_lhs_bf16: ; GFX10: ; %bb.0: @@ -37674,14 +40364,24 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_fneg_rhs_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_fneg_rhs_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_fneg_rhs_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_fneg_rhs_bf16: ; GFX10: ; %bb.0: @@ -37765,16 +40465,28 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v2bf16: ; GFX10: ; %bb.0: @@ -37859,18 +40571,32 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v2bf16: -; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1] +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v2bf16: ; GFX10: ; %bb.0: @@ -37946,15 +40672,27 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_bf16: ; GFX10: ; %bb.0: @@ -38046,21 +40784,39 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_lshr_b32 s2, s0, 16 +; GFX900-NEXT: s_lshr_b32 s3, s1, 16 +; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: v_mov_b32_e32 v2, s2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s0 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_lshr_b32 s2, s0, 16 +; GFX950-NEXT: s_lshr_b32 s3, s1, 16 +; GFX950-NEXT: v_mov_b32_e32 v1, s3 +; GFX950-NEXT: v_mov_b32_e32 v2, s2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v1, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v2bf16: ; GFX10: ; %bb.0: @@ -38159,22 +40915,42 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_vselect_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_vselect_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_lshr_b32 s2, s0, 16 +; GFX900-NEXT: s_lshr_b32 s3, s1, 16 +; GFX900-NEXT: v_mov_b32_e32 v2, s3 +; GFX900-NEXT: v_mov_b32_e32 v3, s2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_mov_b32_e32 v3, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_vselect_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_lshr_b32 s2, s0, 16 +; GFX950-NEXT: s_lshr_b32 s3, s1, 16 +; GFX950-NEXT: v_mov_b32_e32 v2, s3 +; GFX950-NEXT: v_mov_b32_e32 v3, s2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_mov_b32_e32 v3, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, 
v2, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_vselect_v2bf16: ; GFX10: ; %bb.0: @@ -38285,14 +41061,24 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v3bf16: ; GFX10: ; %bb.0: @@ -38383,14 +41169,24 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v4bf16: ; GFX10: ; %bb.0: @@ -38504,15 +41300,26 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v6bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v6bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, 
v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v6bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v6bf16: ; GFX10: ; %bb.0: @@ -38651,16 +41458,28 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v8bf16: ; GFX10: ; %bb.0: @@ -38900,20 +41719,36 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX950-LABEL: v_select_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v16bf16: ; GFX10: ; %bb.0: @@ -39469,32 +42304,60 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: 
scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_cndmask_b32_e32 v14, v31, v15, vcc +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v32bf16: ; GFX10: ; %bb.0: @@ -39604,19 +42467,34 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: v_readfirstlane_b32 s1, v1 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v1, s2 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, s3 +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: v_readfirstlane_b32 s1, v1 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v3bf16: ; GFX10: ; %bb.0: @@ -39720,18 +42598,32 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, 
v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: v_readfirstlane_b32 s0, v1 +; GFX900-NEXT: v_readfirstlane_b32 s1, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v1, s3 +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, s2 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: v_readfirstlane_b32 s1, v0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v1 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v4bf16: ; GFX10: ; %bb.0: @@ -39854,34 +42746,66 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_readfirstlane_b32 s1, v2 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_vselect_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s1, 0x5040100 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v2 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_vselect_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_lshr_b32 s4, s1, 16 +; GFX900-NEXT: s_lshr_b32 s5, s3, 16 +; GFX900-NEXT: v_mov_b32_e32 v4, s5 +; GFX900-NEXT: v_mov_b32_e32 v5, s4 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-NEXT: v_mov_b32_e32 v5, s1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s1, 0x5040100 +; GFX900-NEXT: s_lshr_b32 s3, s0, 16 +; GFX900-NEXT: s_lshr_b32 s4, s2, 16 +; GFX900-NEXT: v_perm_b32 v2, v3, v2, s1 +; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_mov_b32_e32 v3, s2 +; GFX900-NEXT: v_mov_b32_e32 v4, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s1 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: v_readfirstlane_b32 s1, v2 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_vselect_v4bf16: +; GFX950: ; %bb.0: +; 
GFX950-NEXT: s_lshr_b32 s4, s1, 16 +; GFX950-NEXT: s_lshr_b32 s5, s3, 16 +; GFX950-NEXT: v_mov_b32_e32 v4, s5 +; GFX950-NEXT: v_mov_b32_e32 v5, s4 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX950-NEXT: s_lshr_b32 s4, s2, 16 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX950-NEXT: v_mov_b32_e32 v4, s3 +; GFX950-NEXT: v_mov_b32_e32 v5, s1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX950-NEXT: s_mov_b32 s1, 0x5040100 +; GFX950-NEXT: s_lshr_b32 s3, s0, 16 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX950-NEXT: v_perm_b32 v2, v3, v2, s1 +; GFX950-NEXT: v_mov_b32_e32 v3, s4 +; GFX950-NEXT: v_mov_b32_e32 v4, s3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX950-NEXT: v_mov_b32_e32 v3, s2 +; GFX950-NEXT: v_mov_b32_e32 v4, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s1 +; GFX950-NEXT: v_readfirstlane_b32 s1, v2 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_vselect_v4bf16: ; GFX10: ; %bb.0: @@ -40053,26 +42977,48 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX900-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX900-NEXT: s_mov_b64 vcc, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7] +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; 
GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v1 +; GFX950-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[2:3] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX950-NEXT: s_mov_b64 vcc, s[0:1] +; GFX950-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX950-NEXT: v_perm_b32 v1, v2, v1, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v4bf16: ; GFX10: ; %bb.0: @@ -40294,47 +43240,93 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 -; GFX9-NEXT: v_perm_b32 v3, v7, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX900-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX900-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX900-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX900-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX900-NEXT: 
v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX900-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v5, v4, s4 +; GFX900-NEXT: v_perm_b32 v3, v7, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX950-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX950-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX950-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX950-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0 +; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0 +; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v8bf16: ; GFX10: ; %bb.0: @@ -40803,85 +43795,171 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; 
GFX9-NEXT: v_and_b32_e32 v6, 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v10 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v6 -; GFX9-NEXT: v_and_b32_e32 v8, 1, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v30, v22, s[8:9] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v8 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v30 -; GFX9-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v13, 1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[8:9] -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v29, v21, s[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v28 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v22, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v21, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v6, v10, v6, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v23, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 -; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4 -; GFX9-NEXT: v_perm_b32 v4, v11, v20, s4 -; GFX9-NEXT: v_perm_b32 v5, v12, v14, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v13, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX900-NEXT: v_and_b32_e32 v6, 1, v8 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 +; GFX900-NEXT: 
v_and_b32_e32 v6, 1, v10 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6 +; GFX900-NEXT: v_and_b32_e32 v6, 1, v12 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v6 +; GFX900-NEXT: v_and_b32_e32 v8, 1, v13 +; GFX900-NEXT: v_cndmask_b32_e64 v6, v30, v22, s[8:9] +; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v8 +; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v22 +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v30 +; GFX900-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX900-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX900-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX900-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX900-NEXT: v_and_b32_e32 v13, 1, v14 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[8:9] +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; GFX900-NEXT: v_cndmask_b32_e64 v14, v29, v21, s[6:7] +; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v29 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v11 +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; GFX900-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5] +; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9 +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v27 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX900-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[6:7] +; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v28 +; GFX900-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v22, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX900-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] +; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX900-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v27, v21, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; GFX900-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX900-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v6, v10, v6, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e32 v13, v8, v23, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v16 +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v5, v4, s4 +; GFX900-NEXT: v_perm_b32 v3, v9, v19, s4 +; GFX900-NEXT: v_perm_b32 v4, v11, v20, s4 +; GFX900-NEXT: v_perm_b32 v5, v12, v14, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v13, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX950-NEXT: v_and_b32_e32 v13, 1, v13 +; 
GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX950-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX950-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX950-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX950-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX950-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX950-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX950-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX950-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX950-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX950-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX950-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX950-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX950-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX950-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX950-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX950-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX950-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX950-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; GFX950-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v31 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v15, v18, v32, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v25 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: v_lshrrev_b32_e32 v17, 16, v24 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0 +; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0 +; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0 +; GFX950-NEXT: v_perm_b32 v4, v9, v8, s0 +; GFX950-NEXT: v_perm_b32 v5, v11, v10, s0 +; GFX950-NEXT: v_perm_b32 v6, v13, v12, s0 +; GFX950-NEXT: v_perm_b32 v7, v15, v14, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v16bf16: ; GFX10: ; %bb.0: @@ -41981,205 +45059,438 @@ define 
<32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: v_writelane_b32 v33, s30, 0 -; GFX9-NEXT: v_writelane_b32 v33, s31, 1 -; GFX9-NEXT: v_writelane_b32 v33, s34, 2 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_writelane_b32 v33, s35, 3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35] -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95] -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91] -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79] -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] -; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59] -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] 
-; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43] -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25] -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21] -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17] -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9] -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4 -; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4 -; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4 -; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4 -; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4 -; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4 -; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4 -; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4 -; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4 -; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4 -; GFX9-NEXT: v_perm_b32 v13, v26, v29, s4 -; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4 -; GFX9-NEXT: v_perm_b32 v15, v31, v30, s4 -; GFX9-NEXT: v_readlane_b32 s35, v33, 3 -; GFX9-NEXT: v_readlane_b32 s34, v33, 2 -; GFX9-NEXT: v_readlane_b32 s31, v33, 1 -; GFX9-NEXT: v_readlane_b32 s30, v33, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v5 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v4 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v7 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX900-NEXT: v_cmp_eq_u32_e64 
s[16:17], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v9 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v8 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v11 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v10 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v13 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v12 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v15 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v14 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v17 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v16 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v19 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v18 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v21 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v20 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v23 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v22 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v25 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v24 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v27 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v26 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v29 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v28 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0 +; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX900-NEXT: v_writelane_b32 v33, s30, 0 +; GFX900-NEXT: v_writelane_b32 v33, s31, 1 +; GFX900-NEXT: v_writelane_b32 v33, s34, 2 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: v_writelane_b32 v33, s35, 3 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v30 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 +; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX900-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX900-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 +; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; GFX900-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 +; GFX900-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GFX900-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 +; GFX900-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 +; GFX900-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 +; GFX900-NEXT: buffer_load_dword v15, off, s[0:3], 
s32 offset:32 +; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 +; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 +; GFX900-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 +; GFX900-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 +; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 +; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX900-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 +; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35] +; GFX900-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX900-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95] +; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93] +; GFX900-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91] +; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] +; GFX900-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79] +; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] +; GFX900-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75] +; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX900-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] +; GFX900-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63] +; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] +; GFX900-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59] +; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] +; GFX900-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47] +; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] +; GFX900-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43] +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX900-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] +; GFX900-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] +; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25] +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21] +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX900-NEXT: 
v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13] +; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9] +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v5, s4 +; GFX900-NEXT: v_perm_b32 v2, v4, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v6, v9, s4 +; GFX900-NEXT: v_perm_b32 v4, v8, v11, s4 +; GFX900-NEXT: v_perm_b32 v5, v10, v13, s4 +; GFX900-NEXT: v_perm_b32 v6, v12, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v14, v17, s4 +; GFX900-NEXT: v_perm_b32 v8, v16, v19, s4 +; GFX900-NEXT: v_perm_b32 v9, v18, v21, s4 +; GFX900-NEXT: v_perm_b32 v10, v20, v23, s4 +; GFX900-NEXT: v_perm_b32 v11, v22, v25, s4 +; GFX900-NEXT: v_perm_b32 v12, v24, v27, s4 +; GFX900-NEXT: v_perm_b32 v13, v26, v29, s4 +; GFX900-NEXT: v_perm_b32 v14, v28, v32, s4 +; GFX900-NEXT: v_perm_b32 v15, v31, v30, s4 +; GFX900-NEXT: v_readlane_b32 s35, v33, 3 +; GFX900-NEXT: v_readlane_b32 s34, v33, 2 +; GFX900-NEXT: v_readlane_b32 s31, v33, 1 +; GFX900-NEXT: v_readlane_b32 s30, v33, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:60 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:124 +; GFX950-NEXT: scratch_load_ushort v33, off, s32 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:64 +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:128 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:120 +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:56 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116 +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:52 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:112 +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:48 +; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:88 +; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:24 +; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:92 +; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:28 +; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:108 +; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:44 +; GFX950-NEXT: scratch_load_dword v40, off, s32 
offset:96 +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:32 +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:100 +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:36 +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:104 +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:40 +; GFX950-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v29 +; GFX950-NEXT: scratch_load_dword v29, off, s32 offset:84 +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:20 +; GFX950-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX950-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX950-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX950-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX950-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX950-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX950-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX950-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX950-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX950-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX950-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX950-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX950-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX950-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX950-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX950-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX950-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX950-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX950-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX950-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX950-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX950-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX950-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX950-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX950-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: s_waitcnt vmcnt(24) +; GFX950-NEXT: v_lshrrev_b32_e32 v46, 16, v31 +; GFX950-NEXT: s_waitcnt vmcnt(23) +; GFX950-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; GFX950-NEXT: s_waitcnt vmcnt(22) +; GFX950-NEXT: v_and_b32_e32 v28, 1, v33 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:80 +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:16 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v28 +; GFX950-NEXT: v_and_b32_e32 v28, 1, v30 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28 +; GFX950-NEXT: scratch_load_dword v28, off, s32 offset:76 +; GFX950-NEXT: scratch_load_dword v30, off, s32 offset:12 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_lshrrev_b32_e32 v58, 16, v34 +; GFX950-NEXT: s_waitcnt vmcnt(24) +; GFX950-NEXT: v_lshrrev_b32_e32 v59, 16, v35 +; GFX950-NEXT: v_cndmask_b32_e64 v34, v35, v34, s[4:5] +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:72 +; GFX950-NEXT: v_cndmask_b32_e64 v58, v59, v58, s[2:3] +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:8 +; GFX950-NEXT: v_cndmask_b32_e64 v31, v32, v31, s[0:1] +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:68 +; GFX950-NEXT: v_cndmask_b32_e32 v46, v47, v46, vcc +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:4 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_waitcnt vmcnt(26) +; GFX950-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc +; GFX950-NEXT: 
v_lshrrev_b32_e32 v37, 16, v37 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v27 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24 +; GFX950-NEXT: s_waitcnt vmcnt(24) +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v39 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v38 +; GFX950-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v25 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v25, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 +; GFX950-NEXT: s_waitcnt vmcnt(22) +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v49 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v48 +; GFX950-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v23, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20 +; GFX950-NEXT: s_waitcnt vmcnt(16) +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v55 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v54 +; GFX950-NEXT: v_cndmask_b32_e32 v20, v54, v55, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX950-NEXT: s_waitcnt vmcnt(10) +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v44 +; GFX950-NEXT: v_cndmask_b32_e32 v18, v44, v45, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 +; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e32 v19, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v43 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v42 +; GFX950-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e32 v17, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v41 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v40 +; GFX950-NEXT: v_cndmask_b32_e32 v14, v40, v41, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e32 v15, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v53 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v52 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v52, v53, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v51 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v50 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v50, v51, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v11, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX950-NEXT: s_waitcnt vmcnt(8) +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v56 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v29, v56, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v29, v36, vcc +; GFX950-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v57 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v33, v57, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v33, v29, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX950-NEXT: s_waitcnt vmcnt(4) +; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v28, v30, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v28, v29, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v59 +; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v35 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v35, v59, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v29, v28, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v47 +; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v32, v47, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v29, v28, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0 +; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0 +; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0 +; GFX950-NEXT: v_perm_b32 v4, v9, v8, s0 +; GFX950-NEXT: v_perm_b32 v5, v11, v10, s0 +; GFX950-NEXT: v_perm_b32 v6, v13, v12, s0 +; GFX950-NEXT: v_perm_b32 v7, v15, v14, s0 +; GFX950-NEXT: v_perm_b32 v8, v17, v16, s0 +; GFX950-NEXT: v_perm_b32 v9, v19, v18, s0 +; GFX950-NEXT: v_perm_b32 v10, v21, v20, s0 +; GFX950-NEXT: v_perm_b32 v11, v23, v22, s0 +; GFX950-NEXT: v_perm_b32 v12, v25, v24, s0 +; GFX950-NEXT: v_perm_b32 v13, v27, v26, s0 +; GFX950-NEXT: v_perm_b32 v14, v46, v31, s0 +; GFX950-NEXT: v_perm_b32 v15, v58, v34, s0 +; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v32bf16: ; GFX10: ; %bb.0: @@ -42769,21 +46080,31 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fma_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fma_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 
s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_bf16: ; GFX10: ; %bb.0: @@ -42912,31 +46233,45 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fma_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fma_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_fma_f32 v3, v5, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
v_fma_v2bf16: ; GFX10: ; %bb.0: @@ -43118,41 +46453,60 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fma_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fma_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX900-NEXT: v_fma_f32 v3, v6, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, s0 +; GFX950-NEXT: v_and_b32_e32 v3, 
0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v3, v6, v5 +; GFX950-NEXT: v_fmac_f32_e32 v4, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v4, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v3bf16: ; GFX10: ; %bb.0: @@ -43394,50 +46748,73 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fma_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fma_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX900-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX900-NEXT: 
v_lshlrev_b32_e32 v7, 16, v0 +; GFX900-NEXT: v_fma_f32 v3, v7, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_fmac_f32_e32 v6, v8, v7 +; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX950-NEXT: v_fmac_f32_e32 v1, v7, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v3, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v3, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v4bf16: ; GFX10: ; %bb.0: @@ -43640,28 +47017,41 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmuladd_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmuladd_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: 
v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmuladd_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_bf16: ; GFX10: ; %bb.0: @@ -43839,45 +47229,65 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmuladd_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmuladd_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; 
GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmuladd_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v2bf16: ; GFX10: ; %bb.0: @@ -44145,62 +47555,90 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmuladd_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 
v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmuladd_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmuladd_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX950-NEXT: 
v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v3bf16: ; GFX10: ; %bb.0: @@ -44560,78 +47998,113 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmuladd_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmuladd_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; 
GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmuladd_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX950-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v4bf16: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 7eb7d72..006fe51 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -766,10 +766,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr10, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr15, 0, $vgpr14, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir index 253e7e2..0e5ef3c 100644 --- 
a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir @@ -68,7 +68,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VRegOrLds_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -149,7 +149,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir index 474ba71..a25c52f 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir @@ -69,7 +69,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), 
%bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VRegOrLds_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -151,7 +151,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir index 4404f1a..ac8ef48 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir @@ -20,10 +20,10 @@ body: | ; CHECK-LABEL: name: foo1 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_32 */, def undef 
early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %0:vgpr_32, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -41,10 +41,10 @@ body: | ; CHECK-LABEL: name: foo2 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 2228234 /* regdef:VGPR_32 */, def %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -62,10 +62,10 @@ body: | ; CHECK-LABEL: name: foo3 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %1:vgpr_32, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -83,10 +83,10 @@ body: | ; CHECK-LABEL: name: foo4 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1835018 /* 
regdef:VGPR_32 */, def %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 2228234 /* regdef:VGPR_32 */, def %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 4cb0d2d..e6c38d2 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1046,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; 
GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -2667,28 +2659,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -2699,7 +2684,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -3238,10 +3222,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll new file mode 100644 index 0000000..01ebe7d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll @@ -0,0 +1,298 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s + +/* TODO: Support safe bf16 fdiv lowering. +define bfloat @v_fdiv_bf16(bfloat %x, bfloat %y) { + %fdiv = fdiv bfloat %x, %y + ret bfloat %fdiv +} +*/ + +define bfloat @v_rcp_bf16(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv bfloat 1.0, %x + ret bfloat %fdiv +} + +define bfloat @v_rcp_bf16_abs(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_abs: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, |v0.l| +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_abs: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, |v0| +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %x) + %fdiv = fdiv bfloat 1.0, %fabs + ret bfloat %fdiv +} + +define bfloat @v_rcp_bf16_afn(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_afn: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_afn: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv afn bfloat 1.0, %x + ret bfloat %fdiv +} + +define bfloat 
@v_rcp_bf16_neg(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_neg: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_neg: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv bfloat -1.0, %x + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16_neg(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_neg: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_neg: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat -1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. 
+define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_multi_use: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v1.l +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v1.h, v1.l +; GFX1250-TRUE16-NEXT: v_nop +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_multi_use: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + %r = insertelement <2 x bfloat> zeroinitializer, bfloat %x, i32 0 + %r2 = insertelement <2 x bfloat> %r, bfloat %fdiv, i32 1 + ret <2 x bfloat> %r2 +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract0: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. 
+define bfloat @v_neg_rsq_bf16_missing_contract1(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_neg_rsq_bf16_missing_contract1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_neg_rsq_bf16_missing_contract1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv bfloat -1.0, %sqrt + ret bfloat %fdiv +} + +define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) { +; GFX1250-TRUE16-LABEL: v_rsq_v2bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_v2bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) + %fdiv = fdiv contract <2 x bfloat> <bfloat 1.0, bfloat 1.0>, %sqrt + ret <2 x bfloat> %fdiv +} + +define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) { +; GFX1250-TRUE16-LABEL: v_neg_rsq_v2bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.h, -v0.h +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_neg_rsq_v2bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; 
GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v1, -v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) + %fdiv = fdiv contract <2 x bfloat> <bfloat -1.0, bfloat -1.0>, %sqrt + ret <2 x bfloat> %fdiv +} diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir new file mode 100644 index 0000000..e5955ad --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir @@ -0,0 +1,43 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=si-fold-operands -stop-after=prologepilog -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: test_fold_fi_scratch_load_vgpr +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: $sgpr32 +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } +body: | + bb.0.entry: + ; GCN-LABEL: name: test_fold_fi_scratch_load_vgpr + ; GCN: renamable $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + S_ENDPGM 0, implicit %1 + +... + +# SS form of the SCRATCH_LOAD_DWORD does not support offset scaling + +--- +name: test_no_fold_fi_scratch_load_vgpr_scale_offset +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: $sgpr32 +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } +body: | + bb.0.entry: + ; GCN-LABEL: name: test_no_fold_fi_scratch_load_vgpr_scale_offset + ; GCN: renamable $vgpr0 = V_MOV_B32_e32 $sgpr32, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = SCRATCH_LOAD_DWORD killed renamable $vgpr0, 4, 2048, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 2048, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + S_ENDPGM 0, implicit %1 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll new file mode 100644 index 0000000..b68786b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck %s + +define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: basic_test + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[C1]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0 + ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. 
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: unused_active + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 14 + ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + ret i32 14 +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: multiple_blocks + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s1), [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[ICMP]](s1) + ; CHECK-NEXT: G_BRCOND [[INT]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.then: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.if.end: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[COPY1]](s32), %bb.1, [[ADD]](s32), %bb.2 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[PHI]] + ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { + ; CHECK-LABEL: name: ret_64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV]], [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV1]], [[C1]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s64), [[SELECT1]](s64), 1, 1, 1, 0 + ; CHECK-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INTRINSIC_CONVERGENT]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0, implicit $vgpr1 + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll new file mode 100644 index 0000000..3450d63 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=DAGISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GISEL %s + +define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: basic_test + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: basic_test + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY3]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN 
[[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. +define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: unused_active + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 14, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: unused_active + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14 + ; GISEL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ret i32 14 +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: multiple_blocks + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]] + ; DAGISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[COPY]], implicit $exec + ; DAGISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; DAGISEL-NEXT: S_BRANCH %bb.1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: bb.1.if.then: + ; DAGISEL-NEXT: successors: %bb.2(0x80000000) + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[COPY]], 0, implicit $exec + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: bb.2.if.end: + ; DAGISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[V_ADD_U32_e64_]], %bb.1 + ; DAGISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]] + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY1]], [[COPY3]], implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: multiple_blocks + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = 
SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; GISEL-NEXT: S_BRANCH %bb.2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: bb.2.if.then: + ; GISEL-NEXT: successors: %bb.3(0x80000000) + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: bb.3.if.end: + ; GISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.1, [[V_ADD_U32_e64_]], %bb.2 + ; GISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { + ; DAGISEL-LABEL: name: ret_64 + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; DAGISEL-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]] + ; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY5]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, killed [[COPY6]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY7]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; DAGISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[COPY8]], [[COPY4]], 
implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], killed [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; DAGISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]] + ; DAGISEL-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1 + ; + ; GISEL-LABEL: name: ret_64 + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_1]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_2]], 0, [[COPY2]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_3]], 0, [[COPY3]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; GISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1 + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll index b77b2f7..1ec4f25 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s ; 
RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s @@ -19,6 +21,30 @@ define amdgpu_kernel void @v_alignbyte_b32(ptr addrspace(1) %out, i32 %src1, i32 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX9-LABEL: v_alignbyte_b32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbyte_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_alignbyte_b32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_alignbyte_b32 v0, s0, s1, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; ; GFX11-TRUE16-LABEL: v_alignbyte_b32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 @@ -73,6 +99,41 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX9-LABEL: v_alignbyte_b32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x3c +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_alignbyte_b32 v1, v1, v2, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_alignbyte_b32_2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x3c +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbyte_b32 v0, v1, v0, s2 +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; ; GFX11-TRUE16-LABEL: v_alignbyte_b32_2: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll index 25889de..9565314 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll @@ -9,6 +9,172 @@ declare half @llvm.amdgcn.cvt.f16.fp8(i32, i32) declare <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16) declare <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16) +define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e32 v0.l, v0 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, 
v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e32 v0.l, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 0) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 1) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:2 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2 +; 
GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 2) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 3) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.h, v0 byte_sel:3 +; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half 
@llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 3) + %ins.0 = insertelement <2 x half> undef, half 0.0, i32 0 + %ins.1 = insertelement <2 x half> %ins.0, half %cvt, i32 1 + %ret = bitcast <2 x half> %ins.1 to float + ret float %ret +} + define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte0: ; GFX1250-SDAG-REAL16: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll index 2f5ff90..9149ed5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll @@ -304,6 +304,556 @@ bb: ret void } +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: 
v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 
+; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 1, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store 
<8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 2, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps 
void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; 
GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], 
v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF6
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off
+; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C)
+ store <8 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
+; 
GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v8i32(i32 4, <8 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb @@ -815,6 +1365,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) @@ -824,6 +1375,7 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) + declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1) declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll index fe8358f..12ea314 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll @@ -1342,6 +1342,110 @@ bb: ret void } +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: 
global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34 +; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 +; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s1, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 +; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 +; GFX1250-NEXT: v_mov_b32_e32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: 
global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb @@ -2227,6 +2331,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll index 9802144a..bf8308b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll @@ -1126,6 +1126,72 @@ bb: ret void } +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: 
global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 1, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 3, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 4, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb @@ -1967,6 +2033,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll new file mode 100644 index 0000000..ced96ee --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.cos.bf16(bfloat) #0 + +define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 { +; GCN-LABEL: cos_bf16_constant_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_cos_bf16_e32 v0, 0x3f23 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %cos = call bfloat @llvm.cos.bf16(bfloat 4.0) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 { +; GCN-LABEL: cos_bf16_constant_100: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_cos_bf16_e32 v0, 0x417f +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %cos = call bfloat @llvm.cos.bf16(bfloat 100.0) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 978f223..8c1e166 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -5213,121 +5213,15 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 { } define float @v_exp_f32_undef() { -; VI-SDAG-LABEL: v_exp_f32_undef: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_rndne_f32_e32 v0, 0 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7fc00000 -; VI-SDAG-NEXT: v_add_f32_e64 v1, -v0, s4 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_exp_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sub_f32_e64 v0, s4, 0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v2, v0 -; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: 
v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_exp_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; GFX900-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_exp_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; GFX900-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; SI-SDAG-LABEL: v_exp_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; SI-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; SI-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_exp_f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ; -; SI-GISEL-LABEL: v_exp_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; SI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: 
s_setpc_b64 s[30:31] +; SI-LABEL: v_exp_f32_undef: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 70c3787..edc505b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -5291,121 +5291,15 @@ define float @v_exp10_f32_dynamic_mode(float %in) #1 { } define float @v_exp10_f32_undef() { -; VI-SDAG-LABEL: v_exp10_f32_undef: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_rndne_f32_e32 v0, 0 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7fc00000 -; VI-SDAG-NEXT: v_add_f32_e64 v1, -v0, s4 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_exp10_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sub_f32_e64 v0, s4, 0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v2, v0 -; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_exp10_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; GFX900-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_exp10_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; GFX900-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; 
GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; SI-SDAG-LABEL: v_exp10_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; SI-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; SI-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_exp10_f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ; -; SI-GISEL-LABEL: v_exp10_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; SI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp10_f32_undef: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 15bcab9..e71ea50 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -2783,56 +2783,10 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 { } define float @v_exp2_f32_undef() { -; GCN-SDAG-LABEL: v_exp2_f32_undef: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, 0x7fc00000 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 -; SI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_exp2_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; 
VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; VI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_exp2_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 -; GFX900-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_exp2_f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_undef: ; R600: ; %bb.0: @@ -4076,3 +4030,4 @@ attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN-GISEL: {{.*}} +; GCN-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 5634df5..38d1b47 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -5590,162 +5590,15 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { } define float @v_log_f32_undef() { -; SI-SDAG-LABEL: v_log_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 -; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf -; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; SI-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; VI-SDAG-LABEL: v_log_f32_undef: -; 
VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v0, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_log_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_log_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf -; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_log_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, 
v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log_f32_undef: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_log_f32_undef: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-GISEL-LABEL: v_log_f32_undef: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_log_f32_undef: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 8d1a231..058933f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -5590,162 +5590,15 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { } define float @v_log10_f32_undef() { -; SI-SDAG-LABEL: v_log10_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a -; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf -; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; 
SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; SI-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; VI-SDAG-LABEL: v_log10_f32_undef: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v0, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_log10_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_log10_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf -; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_log10_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: 
v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log10_f32_undef: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_log10_f32_undef: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-GISEL-LABEL: v_log10_f32_undef: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_log10_f32_undef: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 7ca72bf..4ca612a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -3542,45 +3542,15 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { } 
define float @v_log2_f32_undef() { -; GFX689-SDAG-LABEL: v_log2_f32_undef: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX689-GISEL-LABEL: v_log2_f32_undef: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX689-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log2_f32_undef: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_log2_f32_undef: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-GISEL-LABEL: v_log2_f32_undef: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_log2_f32_undef: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 355f77a..af914bd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -76,13 +76,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_movk_i32 s4, 0xfc01 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 ; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; SI-NEXT: v_not_b32_e32 v5, v5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll new file mode 100644 index 0000000..7a355a3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs %s -o - | 
FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.sin.bf16(bfloat) #0 + +define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 { +; GCN-LABEL: sin_bf16_constant_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_sin_bf16_e32 v0, 0x3f23 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %sin = call bfloat @llvm.sin.bf16(bfloat 4.0) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 { +; GCN-LABEL: sin_bf16_constant_100: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_sin_bf16_e32 v0, 0x417f +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %sin = call bfloat @llvm.sin.bf16(bfloat 100.0) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir new file mode 100644 index 0000000..76e2092 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir @@ -0,0 +1,104 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-load-store-opt -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: merge_global_load_dword_2_no_scale_offset +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_2_no_scale_offset + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 0, 1, implicit $exec :: (load (s64) from `ptr addrspace(1) undef` + 4, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[COPY]] + %0:sreg_64_xexec_xnull = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... 
+
+---
+name: no_merge_global_load_dword_2_same_scale_offset
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_2_same_scale_offset
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 2049, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 4, 2049, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
+    %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2049, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2049, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name: no_merge_global_load_dword_2_different_scale_offset
+body: |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_load_dword_2_different_scale_offset
+    ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, addrspace 1)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 4, 2048, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, addrspace 1)
+    ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
+    %0:sreg_64_xexec_xnull = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2048, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+# NB: We do not currently support merging SGPR offset and SGPR+Imm offset forms
+# of S_LOAD, but the check stays the same: these cannot be merged with different
+# scale offsets.
+#
+# We also do not currently merge flat scratch instructions, although a common
+# check in the merge logic requires that CPol not be set for a merge to happen.
+
+---
+name: merge_s_load_x1_x1_imm_no_scale_offset
+body: |
+  bb.0:
+    ; GCN-LABEL: name: merge_s_load_x1_x1_imm_no_scale_offset
+    ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF
+    ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+    %0:sgpr_64 = IMPLICIT_DEF
+    %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32))
+    %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32))
+...
+ +--- +name: no_merge_s_load_x1_x1_imm_same_scale_offset +body: | + bb.0: + ; GCN-LABEL: name: no_merge_s_load_x1_x1_imm_same_scale_offset + ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 2048 :: (dereferenceable invariant load (s32)) + ; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 4, 2048 :: (dereferenceable invariant load (s32)) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 2048 :: (dereferenceable invariant load (s32)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 2048 :: (dereferenceable invariant load (s32)) +... + +--- +name: no_merge_s_load_x1_x1_imm_different_scale_offset +body: | + bb.0: + ; GCN-LABEL: name: no_merge_s_load_x1_x1_imm_different_scale_offset + ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) + ; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 4, 2048 :: (dereferenceable invariant load (s32)) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 2048 :: (dereferenceable invariant load (s32)) +... diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll index 047bdde..8281320 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll @@ -11,11 +11,13 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: #dbg_value(ptr addrspace(5) [[BUF_PTR_VAR]], [[META10:![0-9]+]], !DIExpression(), [[DBG21]]) ; CHECK-NEXT: [[AUX_PTR_VAR:%.*]] = alloca i160, align 32, addrspace(5), !dbg [[DBG22:![0-9]+]] ; CHECK-NEXT: #dbg_value(ptr addrspace(5) [[AUX_PTR_VAR]], [[META12:![0-9]+]], !DIExpression(), [[DBG22]]) -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META13:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: #dbg_value(i32 0, [[META13:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[META23:![0-9]+]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF]], [[META13]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META23]]) ; CHECK-NEXT: [[BUF_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF]] to i160, !dbg [[DBG24:![0-9]+]] ; CHECK-NEXT: [[BUF_PTR_INT:%.*]] = shl nuw i160 [[BUF_PTR_INT_RSRC]], 32, !dbg [[DBG24]] ; CHECK-NEXT: store i160 [[BUF_PTR_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG24]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META15:![0-9]+]], !DIExpression(), [[META25:![0-9]+]]) +; CHECK-NEXT: #dbg_value(i32 0, [[META15:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[META25:![0-9]+]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX]], [[META15]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META25]]) ; CHECK-NEXT: [[AUX_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[AUX]] to i160, !dbg [[DBG26:![0-9]+]] ; CHECK-NEXT: [[AUX_PTR_INT:%.*]] = shl nuw i160 [[AUX_PTR_INT_RSRC]], 32, !dbg [[DBG26]] ; CHECK-NEXT: store i160 [[AUX_PTR_INT]], ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG26]] @@ -24,10 +26,12 @@ define float 
@debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128, !dbg [[DBG27]] ; CHECK-NEXT: [[BUF_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8), !dbg [[DBG27]] ; CHECK-NEXT: [[BUF_PTR_2_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_2]] to i32, !dbg [[DBG27]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META16:![0-9]+]], !DIExpression(), [[DBG27]]) +; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_2_PTR_OFF]], [[META16:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG27]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], [[META16]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG27]]) ; CHECK-NEXT: [[BUF_PTR_3_IDX:%.*]] = mul i32 [[IDX]], 4, !dbg [[DBG28:![0-9]+]] ; CHECK-NEXT: [[BUF_PTR_3:%.*]] = add i32 [[BUF_PTR_2_PTR_OFF]], [[BUF_PTR_3_IDX]], !dbg [[DBG28]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META17:![0-9]+]], !DIExpression(), [[DBG28]]) +; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_3]], [[META17:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG28]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], [[META17]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG28]]) ; CHECK-NEXT: [[BUF_PTR_3_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]] to i160, !dbg [[DBG29:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[BUF_PTR_3_INT_RSRC]], 32, !dbg [[DBG29]] ; CHECK-NEXT: [[BUF_PTR_3_INT_OFF:%.*]] = zext i32 [[BUF_PTR_3]] to i160, !dbg [[DBG29]] @@ -38,7 +42,8 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128, !dbg [[DBG30]] ; CHECK-NEXT: [[BUF_PTR_4_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8), !dbg [[DBG30]] ; CHECK-NEXT: [[BUF_PTR_4_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_4]] to i32, !dbg [[DBG30]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META18:![0-9]+]], !DIExpression(), [[DBG30]]) +; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_4_PTR_OFF]], [[META18:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG30]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_4_PTR_RSRC]], [[META18]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG30]]) ; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF_PTR_4_PTR_RSRC]], i32 [[BUF_PTR_4_PTR_OFF]], i32 0, i32 0), !dbg [[DBG31:![0-9]+]] ; CHECK-NEXT: #dbg_value(float [[RET]], [[META19:![0-9]+]], !DIExpression(), [[DBG31]]) ; CHECK-NEXT: [[AUX_PTR_2:%.*]] = load i160, ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG32:![0-9]+]] @@ -46,7 +51,8 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: [[TMP7:%.*]] = trunc i160 [[TMP6]] to i128, !dbg [[DBG32]] ; CHECK-NEXT: [[AUX_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP7]] to ptr addrspace(8), !dbg [[DBG32]] ; CHECK-NEXT: [[AUX_PTR_2_PTR_OFF:%.*]] = trunc i160 [[AUX_PTR_2]] to i32, !dbg [[DBG32]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META20:![0-9]+]], !DIExpression(), [[DBG32]]) +; CHECK-NEXT: #dbg_value(i32 [[AUX_PTR_2_PTR_OFF]], [[META20:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG32]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX_PTR_2_PTR_RSRC]], [[META20]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG32]]) ; CHECK-NEXT: [[BUF_PTR_4_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4]] to <5 x i32>, !dbg [[DBG33:![0-9]+]] ; CHECK-NEXT: [[BUF_PTR_4_SLICE_0:%.*]] = 
shufflevector <5 x i32> [[BUF_PTR_4_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]] diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir index 409b1d6..ce67a2e 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -33,7 +33,7 @@ name: asm_write_vgpr_accvgpr_write_read body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -47,7 +47,7 @@ name: asm_write_vgpr_accvgpr_write_read_partialnop body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0 S_NOP 0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -60,7 +60,7 @@ name: asm_write_vgpr_accvgpr_write_read_otherreg body: | bb.0: liveins: $vgpr0 - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr1 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr1 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll index ae35d0d..e6bc733 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll @@ -17,6 +17,7 @@ ; CHECK-NEXT: .debug_mode: 0 ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll index 638dc89..310040d 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll @@ -19,6 +19,7 @@ ; CHECK-NEXT: .debug_mode: 0 ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: true ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0x200 diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll index fb6ac2e..c1846c0 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll @@ -59,6 +59,7 @@ ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true @@ -113,6 +114,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_gs ; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false @@ -124,6 +126,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_hs ; CHECK-NEXT: .entry_point_symbol: 
hs_shader +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .lds_size: 0x1000 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false @@ -135,6 +138,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_ps ; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll index 15778c8..5c0c366 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll @@ -62,6 +62,7 @@ ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0 @@ -118,6 +119,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_gs_main ; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true @@ -130,6 +132,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_hs_main ; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x1000 ; CHECK-NEXT: .mem_ordered: true @@ -142,6 +145,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_ps_main ; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll index 644722b..830872a 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll @@ -62,6 +62,7 @@ ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0 @@ -118,6 +119,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NOT: .entry_point: _amdgpu_gs_main ; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true @@ -130,6 +132,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NOT: .entry_point: _amdgpu_hs_main ; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x1000 ; CHECK-NEXT: .mem_ordered: true @@ -142,6 +145,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NOT: .entry_point: _amdgpu_ps_main ; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index c9d0cf3..fef7332 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -45,13 +45,13 @@ body: | INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $agpr0 %14:vgpr_32 = COPY killed $agpr0 - 
INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 11534346 /* regdef:VReg_512 */, def %7, 10158090 /* regdef:VReg_256 */, def %8, 4784138 /* regdef:VReg_128 */, def %9, 3670026 /* regdef:VReg_96 */, def %10, 3670026 /* regdef:VReg_96 */, def %11 + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 27262986 /* regdef:VReg_512 */, def %7, 13565962 /* regdef:VReg_256 */, def %8, 6094858 /* regdef:VReg_128 */, def %9, 4784138 /* regdef:VReg_96 */, def %10, 4784138 /* regdef:VReg_96 */, def %11 INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 11534345 /* reguse:VReg_512 */, %7 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10158089 /* reguse:VReg_256 */, %8 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_128 */, %9 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %10 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %11 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 27262985 /* reguse:VReg_512 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 13565961 /* reguse:VReg_256 */, %8 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, %9 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_96 */, %10 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_96 */, %11 $agpr1 = COPY %14 INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9 /* reguse */, killed $agpr1 SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 5d0e4bf..8fe68ba 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword 
v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -545,7 +538,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1084,10 +1076,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -1900,28 +1892,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte 
Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -1932,7 +1917,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -2471,10 +2455,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll new file mode 100644 index 0000000..b5bb68e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll @@ -0,0 +1,372 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps float @s_load_b32_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +; 'i32 %idx' is a signed index while SMRD soffset is unsigned, thus it is not selected. + +define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; SDAG-LABEL: s_load_b32_idx32: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_ashr_i32 s3, s2, 31 +; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0 +; SDAG-NEXT: s_wait_kmcnt 0x0 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_load_b32_idx32: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_ashr_i32 s3, s2, 31 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GISEL-NEXT: s_add_co_u32 s0, s0, s2 +; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3 +; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i32 %idx + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; SDAG-LABEL: s_load_b32_idxprom_wrong_stride: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_mov_b32 s3, 0 +; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0 +; SDAG-NEXT: s_wait_kmcnt 0x0 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_load_b32_idxprom_wrong_stride: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_mov_b32 s3, 0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; GISEL-NEXT: s_add_co_u32 s0, s0, s2 +; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3 +; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @s_load_b16_idxprom_ioffset(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b16_idxprom_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd + %ld = load i16, ptr addrspace(4) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; 
GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b96_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b128_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b256_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <8 x float> %ret +} + +define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b512_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <16 x float> %ret +} + +define amdgpu_ps float @s_load_b32_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b32_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x40 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxadd
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+; Note: this is a byte load, so there is nothing to scale
+
+define amdgpu_ps float @s_load_b8_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b8_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x10
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i8, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i8, ptr addrspace(4) %arrayidx
+ %ret.i32 = zext i8 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxprom
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: 
s_load_b64_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b96_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b128_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b256_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <8 x float> %ret +} + +define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b512_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: 
v_dual_mov_b32 v7, s7 +; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <16 x float> %ret +} + +!0 = !{i32 0, i32 1024} diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index a6b8ea3..6da7d1b 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1 ; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2 ; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0 ; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 @@ -6232,7 +6232,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8 ; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10 ; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 +; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir new file mode 100644 index 0000000..93f4891 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir @@ -0,0 +1,448 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=prologepilog -o - %s | FileCheck %s + +--- +name: save_inactive_lanes_non_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: save_inactive_lanes_non_csr_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $vgpr0 = 
V_MOV_B32_e32 14, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + +... +--- +name: save_all_lanes_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: save_all_lanes_csr_vgpr + ; CHECK: liveins: $vgpr40 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr40 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $vgpr40 = V_MOV_B32_e32 14, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0 + +... +--- +name: save_csr_sgpr_to_non_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr191 + ; CHECK-LABEL: name: save_csr_sgpr_to_non_csr_vgpr + ; CHECK: liveins: $sgpr20, $vgpr191, $vgpr192 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr192, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192 + ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0 + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr192 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr20 = S_MOV_B32 14, implicit $exec + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... 
+--- +name: save_csr_sgpr_to_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr191 + ; CHECK-LABEL: name: save_csr_sgpr_to_csr_vgpr + ; CHECK: liveins: $sgpr20, $vgpr191 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr191, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + ; CHECK-NEXT: $vgpr191 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr20 = S_MOV_B32 14, implicit $exec + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... +--- +name: vgpr_and_sgpr_csr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 4 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + hasSpilledSGPRs: true + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + spillPhysVGPRs: + - '$vgpr191' + wwmReservedRegs: + - '$vgpr191' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191 + + ; CHECK-LABEL: name: vgpr_and_sgpr_csr + ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr49 = 
SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... +--- +name: split_orig_exec +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 4 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + hasSpilledSGPRs: true + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + spillPhysVGPRs: + - '$vgpr191' + wwmReservedRegs: + - '$vgpr191' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191 + + ; CHECK-LABEL: name: split_orig_exec + ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + ; CHECK-NEXT: $sgpr3 = COPY $vcc_lo + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr3, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr3 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3 + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + $sgpr3 = COPY $vcc_lo + S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3 + +... 
+--- +name: vgpr_superregs +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: vgpr_superregs + ; CHECK: liveins: $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr40, $vgpr41, $vgpr42 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr41, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr42, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5) + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 14, implicit 
$exec + S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + +... +--- +name: dont_restore_used_vgprs +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr20' } + - { reg: '$vgpr40' } +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr20, $vgpr40 + + ; CHECK-LABEL: name: dont_restore_used_vgprs + ; CHECK: liveins: $vgpr0, $vgpr20, $vgpr40 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40 + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + +... +--- +name: multiple_blocks +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + ; CHECK-LABEL: name: multiple_blocks + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; CHECK-NEXT: V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc + ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr1 = 
SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0 + bb.0: + successors: %bb.1, %bb.2 + liveins: $vgpr0, $vgpr1 + + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr1 = S_MOV_B32 $exec_lo + V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + + renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + + bb.2: + liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + + $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc + renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll new file mode 100644 index 0000000..53d0292 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -0,0 +1,2414 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL64 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL64 %s + +; Make sure the i1 %active is passed through EXEC. +; The EXEC mask should be set to -1 for the duration of the function +; and restored to its original value in the epilogue. +; We will also need to restore the inactive lanes for any allocated VGPRs. 
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: basic_test: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: basic_test: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: basic_test: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: basic_test: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; 
GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if there's only one use for %active. +define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: single_use_of_active: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: single_use_of_active: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: single_use_of_active: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; 
DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: single_use_of_active: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %y = select i1 %active, i32 %b, i32 17 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. +define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: unused_active: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_mov_b32_e32 v0, 14 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: unused_active: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: v_mov_b32_e32 v0, 14 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: unused_active: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_mov_b32_e32 v0, 14 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] 
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: unused_active: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: v_mov_b32_e32 v0, 14 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + ret i32 14 +} + +; For any used VGPRs (including those used for SGPR spills), we need to restore the inactive lanes. +; For CSR VGPRs, we need to restore all lanes. +define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: csr: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: v_writelane_b32 v2, s20, 0 +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber non-CSR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; DAGISEL-NEXT: v_readlane_b32 s20, v2, 0 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_wait_alu 0xf1ff +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: csr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: v_writelane_b32 v2, s20, 0 +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber non-CSR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: scratch_load_b32 
v40, off, s32 offset:12 ; 4-byte Folded Reload +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; GISEL-NEXT: v_readlane_b32 s20, v2, 0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xf1ff +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: csr: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: v_writelane_b32 v2, s20, 0 +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber non-CSR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; DAGISEL64-NEXT: v_readlane_b32 s20, v2, 0 +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_wait_alu 0xf1ff +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: csr: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR +; GISEL64-NEXT: ;;#ASMEND +; GISEL64-NEXT: v_writelane_b32 v2, s20, 0 +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber non-CSR +; GISEL64-NEXT: ;;#ASMEND +; GISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 
+; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; GISEL64-NEXT: v_readlane_b32 s20, v2, 0 +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_wait_alu 0xf1ff +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + call void asm sideeffect "; clobber CSR", "~{v40},~{s48}"() + call void asm sideeffect "; clobber non-CSR", "~{v49},~{s20}"() + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Save and restore all lanes of v40. +define amdgpu_gfx_whole_wave void @csr_vgpr_only(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: csr_vgpr_only: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR VGPR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: csr_vgpr_only: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR VGPR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: csr_vgpr_only: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR VGPR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: csr_vgpr_only: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR VGPR +; GISEL64-NEXT: ;;#ASMEND +; 
GISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber CSR VGPR", "~{v40}"() + ret void +} + +define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: sgpr_spill_only: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_writelane_b32 v0, s68, 0 +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR SGPR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_readlane_b32 s68, v0, 0 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: sgpr_spill_only: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: v_writelane_b32 v0, s68, 0 +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR SGPR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_readlane_b32 s68, v0, 0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: sgpr_spill_only: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_writelane_b32 v0, s68, 0 +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR SGPR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_readlane_b32 s68, v0, 0 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: sgpr_spill_only: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: v_writelane_b32 v0, s68, 0 +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR SGPR +; GISEL64-NEXT: 
;;#ASMEND +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_readlane_b32 s68, v0, 0 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber CSR SGPR", "~{s68}"() + ret void +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: multiple_blocks: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL-NEXT: s_mov_b32 s1, exec_lo +; DAGISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; DAGISEL-NEXT: ; %bb.1: ; %if.then +; DAGISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; DAGISEL-NEXT: ; %bb.2: ; %if.end +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: multiple_blocks: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s1, exec_lo +; GISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; GISEL-NEXT: ; %bb.1: ; %if.then +; GISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; GISEL-NEXT: ; %bb.2: ; %if.end +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: multiple_blocks: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL64-NEXT: s_mov_b64 s[2:3], exec +; DAGISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; DAGISEL64-NEXT: ; %bb.1: ; %if.then +; DAGISEL64-NEXT: v_add_nc_u32_e32 v1, 
v0, v1 +; DAGISEL64-NEXT: ; %bb.2: ; %if.end +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_or_b64 exec, exec, s[2:3] +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: multiple_blocks: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL64-NEXT: s_mov_b64 s[2:3], exec +; GISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; GISEL64-NEXT: ; %bb.1: ; %if.then +; GISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; GISEL64-NEXT: ; %bb.2: ; %if.end +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { +; DAGISEL-LABEL: ret_64: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v1, 0, v1 :: v_dual_cndmask_b32 v0, 5, v0 +; DAGISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: ret_64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: 
s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 0, v1 +; GISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: ret_64: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: ret_64: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc +; GISEL64-NEXT: 
v_cndmask_b32_e32 v3, 0, v3, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} + +define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i32> inreg %v4i32, float inreg %float, ptr addrspace(5) inreg %ptr, ptr addrspace(5) inreg %ptr2) { +; DAGISEL-LABEL: inreg_args: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: s_clause 0x5 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s9 +; DAGISEL-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; DAGISEL-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s10 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b128 off, v[0:3], s11 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s11 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: s_clause 0x5 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: inreg_args: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s34, -1 +; GISEL-NEXT: s_clause 0x5 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_mov_b32 s0, s5 +; GISEL-NEXT: s_mov_b32 s1, s6 +; GISEL-NEXT: s_mov_b32 s2, s7 +; GISEL-NEXT: s_mov_b32 s3, s8 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v3, s3 +; GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v5, s9 +; GISEL-NEXT: scratch_store_b32 off, v4, s10 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b128 off, v[0:3], s11 +; GISEL-NEXT: scratch_store_b32 off, v5, s11 +; GISEL-NEXT: s_xor_b32 exec_lo, s34, -1 +; GISEL-NEXT: s_clause 0x5 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GISEL-NEXT: s_mov_b32 exec_lo, s34 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: inreg_args: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: s_clause 0x5 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_mov_b32_e32 v4, s4 +; DAGISEL64-NEXT: v_mov_b32_e32 v0, s5 +; DAGISEL64-NEXT: v_mov_b32_e32 v1, s6 +; DAGISEL64-NEXT: v_mov_b32_e32 v2, s7 +; DAGISEL64-NEXT: v_mov_b32_e32 v3, s8 +; DAGISEL64-NEXT: v_mov_b32_e32 v5, s9 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s10 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b128 off, v[0:3], s11 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s11 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: s_clause 0x5 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; DAGISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: inreg_args: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GISEL64-NEXT: s_clause 0x5 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; GISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_mov_b32 s0, s5 +; GISEL64-NEXT: s_mov_b32 s1, s6 +; GISEL64-NEXT: s_mov_b32 s2, s7 +; GISEL64-NEXT: s_mov_b32 s3, s8 +; GISEL64-NEXT: v_mov_b32_e32 v4, s4 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_mov_b32_e32 v0, s0 +; GISEL64-NEXT: v_mov_b32_e32 v1, s1 +; GISEL64-NEXT: v_mov_b32_e32 v2, s2 +; GISEL64-NEXT: 
v_mov_b32_e32 v3, s3 +; GISEL64-NEXT: v_mov_b32_e32 v5, s9 +; GISEL64-NEXT: scratch_store_b32 off, v4, s10 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b128 off, v[0:3], s11 +; GISEL64-NEXT: scratch_store_b32 off, v5, s11 +; GISEL64-NEXT: s_xor_b64 exec, s[34:35], -1 +; GISEL64-NEXT: s_clause 0x5 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GISEL64-NEXT: s_mov_b64 exec, s[34:35] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + store i32 %i32, ptr addrspace(5) %ptr + store <4 x i32> %v4i32, ptr addrspace(5) %ptr2 + store float %float, ptr addrspace(5) %ptr2 + ret void +} + +declare amdgpu_gfx <2 x half> @gfx_callee(<2 x half> %x, <2 x half> %y) + +define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 x half> %x, <2 x half> %y) { +; DAGISEL-LABEL: call_gfx_from_whole_wave: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_mov_b32 s0, s33 +; DAGISEL-NEXT: s_mov_b32 s33, s32 +; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; DAGISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; DAGISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; DAGISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; DAGISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; DAGISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; DAGISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; DAGISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; DAGISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; DAGISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; DAGISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; DAGISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; DAGISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; DAGISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; DAGISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; DAGISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; DAGISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; DAGISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; DAGISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; DAGISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; DAGISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; DAGISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108 +; DAGISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; DAGISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; DAGISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; DAGISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; DAGISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; DAGISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; DAGISEL-NEXT: 
scratch_store_b32 off, v34, s33 offset:140 +; DAGISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; DAGISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; DAGISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; DAGISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; DAGISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; DAGISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; DAGISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; DAGISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; DAGISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; DAGISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; DAGISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; DAGISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; DAGISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; DAGISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; DAGISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; DAGISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; DAGISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; DAGISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; DAGISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; DAGISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; DAGISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; DAGISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; DAGISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; DAGISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; DAGISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; DAGISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; DAGISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; DAGISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; DAGISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; DAGISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; DAGISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; DAGISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; DAGISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; DAGISEL-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; DAGISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; DAGISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; DAGISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; DAGISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; DAGISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; DAGISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; DAGISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; DAGISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; DAGISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; DAGISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; DAGISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; DAGISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; DAGISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; DAGISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; DAGISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; DAGISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; DAGISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; DAGISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; DAGISEL-NEXT: scratch_store_b32 
off, v148, s33 offset:372 +; DAGISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; DAGISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; DAGISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; DAGISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; DAGISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; DAGISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; DAGISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; DAGISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; DAGISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; DAGISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; DAGISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; DAGISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; DAGISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; DAGISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; DAGISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; DAGISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; DAGISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; DAGISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; DAGISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; DAGISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; DAGISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; DAGISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; DAGISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; DAGISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; DAGISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; DAGISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; DAGISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; DAGISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; DAGISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; DAGISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; DAGISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; DAGISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; DAGISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; DAGISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; DAGISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; DAGISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; DAGISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; DAGISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; DAGISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; DAGISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; DAGISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; DAGISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; DAGISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; DAGISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; DAGISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; DAGISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_writelane_b32 v40, s0, 3 +; DAGISEL-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL-NEXT: v_swap_b32 v0, v1 +; DAGISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; DAGISEL-NEXT: v_writelane_b32 v40, s4, 0 
+; DAGISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250 +; DAGISEL-NEXT: v_writelane_b32 v40, s30, 1 +; DAGISEL-NEXT: v_writelane_b32 v40, s31, 2 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2 +; DAGISEL-NEXT: v_readlane_b32 s30, v40, 1 +; DAGISEL-NEXT: v_readlane_b32 s4, v40, 0 +; DAGISEL-NEXT: v_readlane_b32 s0, v40, 3 +; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 s32, s33 +; DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; DAGISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; DAGISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; DAGISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; DAGISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32 +; DAGISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; DAGISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; DAGISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; DAGISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; DAGISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; DAGISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; DAGISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; DAGISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; DAGISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; DAGISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; DAGISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; DAGISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; DAGISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; DAGISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; DAGISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; DAGISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; DAGISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; DAGISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; DAGISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; DAGISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; DAGISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; DAGISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; DAGISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; DAGISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; DAGISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136 +; DAGISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; DAGISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; DAGISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; DAGISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; DAGISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; DAGISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; DAGISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; DAGISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; DAGISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; DAGISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; DAGISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; DAGISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; DAGISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; DAGISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; DAGISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196 
+; DAGISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; DAGISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; DAGISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; DAGISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; DAGISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; DAGISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; DAGISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; DAGISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; DAGISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; DAGISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; DAGISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; DAGISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264 +; DAGISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; DAGISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; DAGISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; DAGISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; DAGISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; DAGISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; DAGISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; DAGISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; DAGISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; DAGISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; DAGISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; DAGISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; DAGISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; DAGISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; DAGISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; DAGISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; DAGISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; DAGISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; DAGISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; DAGISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; DAGISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; DAGISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; DAGISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; DAGISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; DAGISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; DAGISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; DAGISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; DAGISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; DAGISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; DAGISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; DAGISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392 +; DAGISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; DAGISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; DAGISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; DAGISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; DAGISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; DAGISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; DAGISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; DAGISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; DAGISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; DAGISEL-NEXT: 
scratch_load_b32 v179, off, s33 offset:432 +; DAGISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; DAGISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; DAGISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; DAGISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; DAGISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; DAGISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; DAGISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; DAGISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; DAGISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; DAGISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; DAGISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; DAGISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; DAGISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; DAGISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; DAGISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; DAGISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; DAGISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; DAGISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; DAGISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; DAGISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; DAGISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; DAGISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; DAGISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; DAGISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; DAGISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; DAGISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; DAGISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; DAGISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; DAGISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; DAGISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; DAGISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; DAGISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s4 +; DAGISEL-NEXT: s_mov_b32 s33, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: call_gfx_from_whole_wave: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_mov_b32 s0, s33 +; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_xor_saveexec_b32 s4, -1 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; GISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; GISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; GISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; GISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; GISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; GISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; GISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; GISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; GISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; GISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; GISEL-NEXT: scratch_store_b32 
off, v13, s33 offset:56 +; GISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; GISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; GISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; GISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; GISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; GISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; GISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; GISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; GISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; GISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; GISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; GISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; GISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108 +; GISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; GISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; GISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; GISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; GISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; GISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; GISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140 +; GISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; GISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; GISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; GISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; GISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; GISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; GISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; GISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; GISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; GISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; GISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; GISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; GISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; GISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; GISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; GISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; GISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; GISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; GISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; GISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; GISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; GISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; GISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; GISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; GISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; GISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; GISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; GISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; GISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; GISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; GISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; GISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; GISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; GISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; GISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; GISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; GISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; GISEL-NEXT: scratch_store_b32 off, v113, s33 
offset:296 +; GISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; GISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; GISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; GISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; GISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; GISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; GISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; GISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; GISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; GISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; GISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; GISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; GISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; GISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; GISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; GISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; GISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; GISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; GISEL-NEXT: scratch_store_b32 off, v148, s33 offset:372 +; GISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; GISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; GISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; GISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; GISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; GISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; GISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; GISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; GISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; GISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; GISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; GISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; GISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; GISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; GISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; GISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; GISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; GISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; GISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; GISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; GISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; GISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; GISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; GISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; GISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; GISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; GISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; GISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; GISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; GISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; GISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; GISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; GISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; GISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; GISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; GISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; GISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; GISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; 
GISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; GISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; GISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; GISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; GISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; GISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; GISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; GISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; GISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; GISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; GISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_writelane_b32 v40, s0, 3 +; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: v_swap_b32 v0, v1 +; GISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; GISEL-NEXT: v_writelane_b32 v40, s4, 0 +; GISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; GISEL-NEXT: s_addk_co_i32 s32, 0x250 +; GISEL-NEXT: v_writelane_b32 v40, s30, 1 +; GISEL-NEXT: v_writelane_b32 v40, s31, 2 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_readlane_b32 s31, v40, 2 +; GISEL-NEXT: v_readlane_b32 s30, v40, 1 +; GISEL-NEXT: v_readlane_b32 s4, v40, 0 +; GISEL-NEXT: v_readlane_b32 s0, v40, 3 +; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 s32, s33 +; GISEL-NEXT: s_xor_b32 exec_lo, s4, -1 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; GISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; GISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; GISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; GISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; GISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32 +; GISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; GISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; GISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; GISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; GISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; GISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; GISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; GISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; GISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; GISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; GISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; GISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; GISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; GISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; GISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; GISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; GISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; GISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; GISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; GISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; GISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; GISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; GISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; GISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; GISEL-NEXT: scratch_load_b32 v33, off, s33 
offset:136 +; GISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; GISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; GISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; GISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; GISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; GISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; GISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; GISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; GISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; GISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; GISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; GISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; GISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; GISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; GISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196 +; GISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; GISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; GISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; GISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; GISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; GISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; GISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; GISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; GISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; GISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; GISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; GISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; GISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; GISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; GISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; GISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264 +; GISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; GISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; GISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; GISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; GISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; GISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; GISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; GISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; GISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; GISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; GISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; GISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; GISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; GISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; GISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; GISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; GISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; GISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; GISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; GISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; GISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; GISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; GISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; GISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; GISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; GISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; GISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; GISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; GISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; 
GISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; GISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392 +; GISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; GISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; GISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; GISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; GISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; GISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; GISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; GISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; GISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; GISEL-NEXT: scratch_load_b32 v179, off, s33 offset:432 +; GISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; GISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; GISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; GISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; GISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; GISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; GISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; GISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; GISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; GISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; GISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; GISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; GISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; GISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; GISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; GISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; GISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; GISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; GISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; GISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; GISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; GISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; GISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; GISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; GISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; GISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; GISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; GISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; GISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; GISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; GISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; GISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; GISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; GISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; GISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; GISEL-NEXT: s_mov_b32 exec_lo, s4 +; GISEL-NEXT: s_mov_b32 s33, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: call_gfx_from_whole_wave: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_mov_b32 s0, s33 +; DAGISEL64-NEXT: s_mov_b32 s33, s32 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; 
DAGISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; DAGISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; DAGISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; DAGISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; DAGISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; DAGISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; DAGISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; DAGISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; DAGISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; DAGISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; DAGISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; DAGISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; DAGISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; DAGISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; DAGISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; DAGISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; DAGISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; DAGISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; DAGISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; DAGISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; DAGISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; DAGISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108 +; DAGISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; DAGISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; DAGISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; DAGISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; DAGISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; DAGISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; DAGISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140 +; DAGISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; DAGISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; DAGISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; DAGISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; DAGISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; DAGISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; DAGISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; DAGISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; DAGISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; DAGISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; DAGISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; DAGISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; DAGISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; DAGISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; DAGISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; DAGISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; DAGISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; DAGISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; DAGISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; DAGISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; DAGISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; DAGISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; DAGISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; DAGISEL64-NEXT: 
scratch_store_b32 off, v82, s33 offset:236 +; DAGISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; DAGISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; DAGISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; DAGISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; DAGISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; DAGISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; DAGISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; DAGISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; DAGISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; DAGISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; DAGISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; DAGISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; DAGISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; DAGISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; DAGISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; DAGISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; DAGISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; DAGISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; DAGISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; DAGISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; DAGISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; DAGISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; DAGISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; DAGISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; DAGISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; DAGISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; DAGISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; DAGISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; DAGISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; DAGISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; DAGISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; DAGISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; DAGISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372 +; DAGISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; DAGISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; DAGISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; DAGISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; DAGISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; DAGISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; DAGISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; DAGISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; DAGISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; DAGISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; DAGISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; DAGISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; DAGISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; DAGISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; DAGISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; DAGISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; DAGISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; DAGISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; DAGISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; DAGISEL64-NEXT: scratch_store_b32 off, v193, s33 
offset:456 +; DAGISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; DAGISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; DAGISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; DAGISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; DAGISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; DAGISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; DAGISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; DAGISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; DAGISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; DAGISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; DAGISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; DAGISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; DAGISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; DAGISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; DAGISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; DAGISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; DAGISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; DAGISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; DAGISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; DAGISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; DAGISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; DAGISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; DAGISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; DAGISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; DAGISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; DAGISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; DAGISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; DAGISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; DAGISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_writelane_b32 v40, s0, 4 +; DAGISEL64-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL64-NEXT: v_swap_b32 v0, v1 +; DAGISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; DAGISEL64-NEXT: v_writelane_b32 v40, s4, 0 +; DAGISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x250 +; DAGISEL64-NEXT: v_writelane_b32 v40, s5, 1 +; DAGISEL64-NEXT: v_writelane_b32 v40, s30, 2 +; DAGISEL64-NEXT: v_writelane_b32 v40, s31, 3 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1] +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3 +; DAGISEL64-NEXT: v_readlane_b32 s30, v40, 2 +; DAGISEL64-NEXT: v_readlane_b32 s5, v40, 1 +; DAGISEL64-NEXT: v_readlane_b32 s4, v40, 0 +; DAGISEL64-NEXT: v_readlane_b32 s0, v40, 4 +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b32 s32, s33 +; DAGISEL64-NEXT: s_xor_b64 exec, s[4:5], -1 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; DAGISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; DAGISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; DAGISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; DAGISEL64-NEXT: scratch_load_b32 v7, off, s33 
offset:32 +; DAGISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; DAGISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; DAGISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; DAGISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; DAGISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; DAGISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; DAGISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; DAGISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; DAGISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; DAGISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; DAGISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; DAGISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; DAGISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; DAGISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; DAGISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; DAGISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; DAGISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; DAGISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; DAGISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; DAGISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; DAGISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; DAGISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; DAGISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; DAGISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; DAGISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136 +; DAGISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; DAGISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; DAGISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; DAGISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; DAGISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; DAGISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; DAGISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; DAGISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; DAGISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; DAGISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; DAGISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; DAGISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; DAGISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; DAGISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; DAGISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196 +; DAGISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; DAGISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; DAGISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; DAGISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; DAGISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; DAGISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; DAGISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; DAGISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; DAGISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; DAGISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; DAGISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; DAGISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; DAGISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; DAGISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; DAGISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; DAGISEL64-NEXT: 
scratch_load_b32 v97, off, s33 offset:264 +; DAGISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; DAGISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; DAGISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; DAGISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; DAGISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; DAGISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; DAGISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; DAGISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; DAGISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; DAGISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; DAGISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; DAGISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; DAGISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; DAGISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; DAGISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; DAGISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; DAGISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; DAGISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; DAGISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; DAGISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; DAGISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; DAGISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; DAGISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; DAGISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; DAGISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; DAGISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; DAGISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; DAGISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; DAGISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; DAGISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; DAGISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392 +; DAGISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; DAGISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; DAGISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; DAGISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; DAGISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; DAGISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; DAGISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; DAGISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; DAGISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; DAGISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432 +; DAGISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; DAGISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; DAGISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; DAGISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; DAGISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; DAGISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; DAGISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; DAGISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; DAGISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; DAGISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; DAGISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; DAGISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; DAGISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; DAGISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; 
DAGISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; DAGISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; DAGISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; DAGISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; DAGISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; DAGISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; DAGISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; DAGISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; DAGISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; DAGISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; DAGISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; DAGISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; DAGISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; DAGISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; DAGISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; DAGISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; DAGISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; DAGISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; DAGISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; DAGISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; DAGISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; DAGISEL64-NEXT: s_mov_b64 exec, s[4:5] +; DAGISEL64-NEXT: s_mov_b32 s33, s0 +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: call_gfx_from_whole_wave: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_mov_b32 s0, s33 +; GISEL64-NEXT: s_mov_b32 s33, s32 +; GISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; GISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; GISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; GISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; GISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; GISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; GISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; GISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; GISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; GISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; GISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; GISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; GISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; GISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; GISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; GISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; GISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; GISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; GISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; GISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; GISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; GISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; GISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; GISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; GISEL64-NEXT: scratch_store_b32 off, 
v26, s33 offset:108 +; GISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; GISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; GISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; GISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; GISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; GISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; GISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140 +; GISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; GISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; GISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; GISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; GISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; GISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; GISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; GISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; GISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; GISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; GISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; GISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; GISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; GISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; GISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; GISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; GISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; GISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; GISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; GISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; GISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; GISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; GISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; GISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; GISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; GISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; GISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; GISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; GISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; GISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; GISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; GISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; GISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; GISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; GISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; GISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; GISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; GISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; GISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; GISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; GISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; GISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; GISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; GISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; GISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; GISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; GISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; GISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; GISEL64-NEXT: scratch_store_b32 
off, v132, s33 offset:340 +; GISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; GISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; GISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; GISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; GISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; GISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; GISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; GISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372 +; GISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; GISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; GISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; GISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; GISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; GISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; GISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; GISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; GISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; GISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; GISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; GISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; GISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; GISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; GISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; GISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; GISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; GISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; GISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; GISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; GISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; GISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; GISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; GISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; GISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; GISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; GISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; GISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; GISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; GISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; GISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; GISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; GISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; GISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; GISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; GISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; GISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; GISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; GISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; GISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; GISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; GISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; GISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; GISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; GISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; GISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; GISEL64-NEXT: scratch_store_b32 off, v245, s33 
offset:568 +; GISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; GISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_writelane_b32 v40, s0, 4 +; GISEL64-NEXT: v_mov_b32_e32 v2, v0 +; GISEL64-NEXT: v_swap_b32 v0, v1 +; GISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; GISEL64-NEXT: v_writelane_b32 v40, s4, 0 +; GISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; GISEL64-NEXT: s_addk_co_i32 s32, 0x250 +; GISEL64-NEXT: v_writelane_b32 v40, s5, 1 +; GISEL64-NEXT: v_writelane_b32 v40, s30, 2 +; GISEL64-NEXT: v_writelane_b32 v40, s31, 3 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_readlane_b32 s31, v40, 3 +; GISEL64-NEXT: v_readlane_b32 s30, v40, 2 +; GISEL64-NEXT: v_readlane_b32 s5, v40, 1 +; GISEL64-NEXT: v_readlane_b32 s4, v40, 0 +; GISEL64-NEXT: v_readlane_b32 s0, v40, 4 +; GISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b32 s32, s33 +; GISEL64-NEXT: s_xor_b64 exec, s[4:5], -1 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; GISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; GISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; GISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; GISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; GISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; GISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; GISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32 +; GISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; GISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; GISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; GISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; GISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; GISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; GISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; GISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; GISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; GISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; GISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; GISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; GISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; GISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; GISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; GISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; GISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; GISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; GISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; GISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; GISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; GISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; GISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; GISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; GISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136 +; GISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; GISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; GISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; GISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; GISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; 
GISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; GISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; GISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; GISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; GISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; GISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; GISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; GISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; GISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; GISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196 +; GISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; GISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; GISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; GISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; GISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; GISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; GISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; GISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; GISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; GISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; GISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; GISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; GISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; GISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; GISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; GISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264 +; GISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; GISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; GISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; GISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; GISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; GISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; GISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; GISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; GISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; GISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; GISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; GISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; GISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; GISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; GISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; GISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; GISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; GISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; GISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; GISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; GISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; GISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; GISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; GISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; GISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; GISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; GISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; GISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; GISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; GISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; GISEL64-NEXT: scratch_load_b32 v161, off, 
s33 offset:392 +; GISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; GISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; GISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; GISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; GISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; GISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; GISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; GISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; GISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; GISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432 +; GISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; GISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; GISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; GISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; GISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; GISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; GISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; GISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; GISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; GISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; GISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; GISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; GISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; GISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; GISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; GISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; GISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; GISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; GISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; GISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; GISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; GISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; GISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; GISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; GISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; GISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; GISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; GISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; GISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; GISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; GISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; GISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; GISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; GISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; GISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; GISEL64-NEXT: s_mov_b64 exec, s[4:5] +; GISEL64-NEXT: s_mov_b32 s33, s0 +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent + ret <2 x half> %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir b/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir new file mode 100644 index 0000000..2f7a6e2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir @@ -0,0 +1,902 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s 
-o - | FileCheck -check-prefix=GFX1250 %s + +# WMMA writes: D0, WMMA reads: A0/B0/Index0 +# VALU writes: D1, VALU reads: Use1 +# Hazards could be: +# RAW: D0 overlaps Use1 +# WAW: D0 overlaps D1 +# WAR: A0/B0/Index0 overlaps D1 + +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec +... + +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_valus_in_between + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr26 = V_MOV_B32_e32 26, implicit $exec + ; GFX1250-NEXT: $vgpr27 = V_MOV_B32_e32 27, implicit $exec + ; GFX1250-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec + ; GFX1250-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec + ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr26 = V_MOV_B32_e32 26, implicit $exec + $vgpr27 = V_MOV_B32_e32 27, implicit $exec + $vgpr28 = V_MOV_B32_e32 28, implicit $exec + $vgpr29 = V_MOV_B32_e32 29, implicit $exec + $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec +...
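+# Judging by the checks in the two tests above, the post-RA hazard recognizer
+# keeps a WMMA def and a conflicting VALU at least four instruction slots
+# apart for these opcodes, inserting V_NOP_e32 when nothing else fills the
+# window. Independent VALU defs (the V_MOV_B32 run above) count toward the
+# window on their own; the *_with_4_salus_in_between tests below show that
+# SALU instructions do not, so the full set of V_NOPs is still inserted.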
+ +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_salus_in_between + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 3 + $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec +... + +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_bf16_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec +... + +--- +name: test_wmma_f32_16x16x32_bf16_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec +... + +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_valus_in_between + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr26 = V_MOV_B32_e32 26, implicit $exec + ; GFX1250-NEXT: $vgpr27 = V_MOV_B32_e32 27, implicit $exec + ; GFX1250-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec + ; GFX1250-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec + ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr22, $vgpr30, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr26 = V_MOV_B32_e32 26, implicit $exec + $vgpr27 = V_MOV_B32_e32 27, implicit $exec + $vgpr28 = V_MOV_B32_e32 28, implicit $exec + $vgpr29 = V_MOV_B32_e32 29, implicit $exec + $vgpr31 = V_ADD_F32_e32 $vgpr22, $vgpr30, implicit $mode, implicit $exec +... + +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_salus_in_between + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 3 + $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr22 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr22 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec +... + +--- +name: test_wmma_f16_16x16x64_fp8_fp8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec +... + +--- +name: test_wmma_f16_16x16x64_fp8_fp8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec +... 
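+# The F8F6F4 tests below exercise the same three hazard shapes (RAW on Use1,
+# WAW on D1, WAR on A0/B0) with 16-register A and B tuples; per the checks,
+# the required window for this opcode is still four slots.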
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec +... + +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_valus_in_between + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec + ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec + ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr42 = V_MOV_B32_e32 42, implicit $exec + $vgpr43 = V_MOV_B32_e32 43, implicit $exec + $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_salus_in_between + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 3 + $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec +... + +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... + +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... 
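+# The IU8 variants below appear to require a longer window: the checks expect
+# eight V_NOP_e32 instructions, or eight independent VALUs, between the WMMA
+# and the conflicting instruction instead of four.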
+ +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec +... + +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec + ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec + ; GFX1250-NEXT: $vgpr44 = V_MOV_B32_e32 44, implicit $exec + ; GFX1250-NEXT: $vgpr45 = V_MOV_B32_e32 45, implicit $exec + ; GFX1250-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec + ; GFX1250-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr42 = V_MOV_B32_e32 42, implicit $exec + $vgpr43 = V_MOV_B32_e32 43, implicit $exec + $vgpr44 = V_MOV_B32_e32 44, implicit $exec + $vgpr45 = V_MOV_B32_e32 45, implicit $exec + $vgpr46 = V_MOV_B32_e32 46, implicit $exec + $vgpr47 = V_MOV_B32_e32 47, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3 + ; GFX1250-NEXT: $sgpr4 = S_MOV_B32 4 + ; GFX1250-NEXT: $sgpr5 = S_MOV_B32 5 + ; GFX1250-NEXT: $sgpr6 = S_MOV_B32 6 + ; GFX1250-NEXT: $sgpr7 = S_MOV_B32 7 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 3 + $sgpr4 = S_MOV_B32 4 + $sgpr5 = S_MOV_B32 5 + $sgpr6 = S_MOV_B32 6 + $sgpr7 = S_MOV_B32 7 + $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec +... + +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec +... + +--- +name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec + $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec +...
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_valus_in_between + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec + ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec + ; GFX1250-NEXT: $vgpr44 = V_MOV_B32_e32 44, implicit $exec + ; GFX1250-NEXT: $vgpr45 = V_MOV_B32_e32 45, implicit $exec + ; GFX1250-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec + ; GFX1250-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec + ; GFX1250-NEXT: $vgpr49 = V_ADD_F32_e32 $vgpr32, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr42 = V_MOV_B32_e32 42, implicit $exec + $vgpr43 = V_MOV_B32_e32 43, implicit $exec + $vgpr44 = V_MOV_B32_e32 44, implicit $exec + $vgpr45 = V_MOV_B32_e32 45, implicit $exec + $vgpr46 = V_MOV_B32_e32 46, implicit $exec + $vgpr47 = V_MOV_B32_e32 47, implicit $exec + $vgpr49 = V_ADD_F32_e32 $vgpr32, $vgpr48, implicit $mode, implicit $exec +...
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_salus_in_between + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3 + ; GFX1250-NEXT: $sgpr4 = S_MOV_B32 4 + ; GFX1250-NEXT: $sgpr5 = S_MOV_B32 5 + ; GFX1250-NEXT: $sgpr6 = S_MOV_B32 6 + ; GFX1250-NEXT: $sgpr7 = S_MOV_B32 7 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 3 + $sgpr4 = S_MOV_B32 4 + $sgpr5 = S_MOV_B32 5 + $sgpr6 = S_MOV_B32 6 + $sgpr7 = S_MOV_B32 7 + $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec +...
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... + +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_valus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_salus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr24 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x64_bf16_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x64_bf16_Index0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_Index0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_valus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_salus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr24 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_Index0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_Index0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr28 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr28 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_valus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec + ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr42 = V_MOV_B32_e32 42, implicit $exec + $vgpr43 = V_MOV_B32_e32 43, implicit $exec + $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_salus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 4 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 4 + $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x128_iu8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec +...
+ +--- +name: test_swmmac_i32_16x16x128_iu8_Index0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_Index0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec +...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir new file mode 100644 index 0000000..2032b98 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir @@ -0,0 +1,1430 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1250 %s
+
+# For two consecutive wmma instructions, we need to insert V_NOP instructions between
+# them if the A matrix, B matrix, or index of the second wmma is the same as, or
+# overlaps with, the previous wmma instruction's D-matrix.
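+#
+# An illustrative, hand-written sketch of the rule (not one of the autogenerated
+# checks): in the first test below, the first wmma writes its D-matrix to
+# $vgpr4-$vgpr11 and the second wmma reads $vgpr4_vgpr5 as its A-matrix, so the
+# hazard recognizer has to separate the two, here with a single V_NOP:
+#
+#   $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec
+#   V_NOP_e32 implicit $exec
+#   $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr4_vgpr5, 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec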
+ +--- +name: test_wmma_f32_16x16x4_f32_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr4_vgpr5, 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr4_vgpr5, 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x4_f32_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr14_vgpr15, 8, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr14_vgpr15, 8, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x4_f32_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec + $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +...
+ +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +...
+ +--- +name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +...
+ +--- +name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +...
+ +--- +name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr26_vgpr27, 0, 0, 0, implicit $exec + $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr26_vgpr27, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+
+---
+name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_Index1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_Index1
+    ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_A1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_A1
+    ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_B1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_B1
+    ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+    $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec
+    $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_Index1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_Index1
+    ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+...
+ +--- +name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+
+---
+name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_Index1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_Index1
+    ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec
+    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1
+    ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
+    $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec
+    $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec
+...
+ +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x32_f16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_f16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x32_f16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_wmma_f16_16x16x32_f16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... 
+
+---
+name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_Index1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_Index1
+    ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_A1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_A1
+    ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_B1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_B1
+    ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec
+...
+ +--- +name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_f16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_f16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_f16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x64_f16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x64_f16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x64_f16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec +... diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll index 3562b93..9e1aa10 100644 --- a/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll +++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-backward.ll @@ -1,28 +1,21 @@ ; RUN: llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s -; RUN: llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - | llvm-objdump --mcpu=avr25 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR25 %s ; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s ; ATTINY85: <main>: ; ATTINY85-NEXT: andi r24, 0x1 ; ATTINY85: cpi r24, 0x0 -; ATTINY85-NEXT: breq .+2 -; ATTINY85-NEXT: rjmp .+4086 +; ATTINY85-NEXT: breq .-2 +; ATTINY85-NEXT: R_AVR_7_PCREL .text+0x100c +; ATTINY85-NEXT: rjmp .-2 +; ATTINY85-NEXT: R_AVR_13_PCREL .text+0x2 ; ATTINY85: ldi r24, 0x3 ; ATTINY85-NEXT: ret -; AVR25: <main>: -; AVR25-NEXT: andi r24, 0x1 -; AVR25: cpi r24, 0x0 -; AVR25-NEXT: breq .+2 -; AVR25-NEXT: rjmp .-2 -; AVR25-NEXT: R_AVR_13_PCREL .text+0x2 -; AVR25: ldi r24, 0x3 -; AVR25-NEXT: ret - ; AVR3: <main>: ; AVR3-NEXT: andi r24, 0x1 ; AVR3: cpi r24, 0x0 -; AVR3-NEXT: breq .+4 +; AVR3-NEXT: breq .-2 +; AVR3-NEXT: R_AVR_7_PCREL .text+0x100e ; AVR3-NEXT: jmp 0x0 ; AVR3-NEXT: R_AVR_CALL .text+0x2 ; AVR3: ldi r24, 0x3 diff --git a/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll index a51cf42..1fc84a7 100644 --- a/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll +++ b/llvm/test/CodeGen/AVR/branch-relaxation-long-forward.ll @@ -1,28 +1,21 @@ ; RUN: 
llc < %s -mtriple=avr -mcpu=attiny85 -filetype=obj -o - | llvm-objdump --mcpu=attiny85 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=ATTINY85 %s -; RUN: llc < %s -mtriple=avr -mcpu=avr25 -filetype=obj -o - | llvm-objdump --mcpu=avr25 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR25 %s ; RUN: llc < %s -mtriple=avr -mcpu=avr3 -filetype=obj -o - | llvm-objdump --mcpu=avr3 -dr --no-show-raw-insn --no-leading-addr - | FileCheck --check-prefix=AVR3 %s ; ATTINY85: <main>: ; ATTINY85-NEXT: andi r24, 0x1 ; ATTINY85-NEXT: cpi r24, 0x0 -; ATTINY85-NEXT: brne .+2 -; ATTINY85-NEXT: rjmp .-4092 +; ATTINY85-NEXT: brne .-2 +; ATTINY85-NEXT: R_AVR_7_PCREL .text+0x8 +; ATTINY85-NEXT: rjmp .-2 +; ATTINY85-NEXT: R_AVR_13_PCREL .text+0x100c ; ATTINY85: ldi r24, 0x3 ; ATTINY85-NEXT: ret -; AVR25: <main>: -; AVR25-NEXT: andi r24, 0x1 -; AVR25-NEXT: cpi r24, 0x0 -; AVR25-NEXT: brne .+2 -; AVR25-NEXT: rjmp .-2 -; AVR25-NEXT: R_AVR_13_PCREL .text+0x100c -; AVR25: ldi r24, 0x3 -; AVR25-NEXT: ret - ; AVR3: <main>: ; AVR3-NEXT: andi r24, 0x1 ; AVR3-NEXT: cpi r24, 0x0 -; AVR3-NEXT: brne .+4 +; AVR3-NEXT: brne .-2 +; AVR3-NEXT: R_AVR_7_PCREL .text+0xa ; AVR3-NEXT: jmp 0x0 ; AVR3-NEXT: R_AVR_CALL .text+0x100e ; AVR3: ldi r24, 0x3 diff --git a/llvm/test/CodeGen/AVR/jmp.ll b/llvm/test/CodeGen/AVR/jmp.ll index 95dfff4..1cbc637 100644 --- a/llvm/test/CodeGen/AVR/jmp.ll +++ b/llvm/test/CodeGen/AVR/jmp.ll @@ -18,7 +18,8 @@ declare i8 @bar(i8); ; CHECK: rcall .-2 ; CHECK-NEXT: 00000000: R_AVR_13_PCREL bar ; CHECK-NEXT: cpi r24, 0x7b -; CHECK-NEXT: brne .+4 +; CHECK-NEXT: brne .-2 +; CHECK-NEXT: R_AVR_7_PCREL .text+0xa ; CHECK-NEXT: ldi r24, 0x64 ; CHECK-NEXT: ret ; CHECK-NEXT: ldi r24, 0xc8 diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/lifetimes-noint64op.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/lifetimes-noint64op.ll deleted file mode 100644 index 736c86e..0000000 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/lifetimes-noint64op.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: opt -S --passes="print-dx-shader-flags" 2>&1 %s | FileCheck %s -; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC - -target triple = "dxil-pc-shadermodel6.7-library" - -; CHECK: ; Combined Shader Flags for Module -; CHECK-NEXT: ; Shader Flags Value: 0x00000000 -; CHECK-NEXT: ; -; CHECK-NOT: ; Note: shader requires additional functionality: -; CHECK-NOT: ; 64-Bit integer -; CHECK-NOT: ; Note: extra DXIL module flags: -; CHECK-NOT: ; -; CHECK-NEXT: ; Shader Flags for Module Functions -; CHECK-NEXT: ; Function lifetimes : 0x00000000 - -define void @lifetimes() #0 { - %a = alloca [4 x i32], align 8 - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %a) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %a) - ret void -} - -; Function Attrs: nounwind memory(argmem: readwrite) -declare void @llvm.lifetime.start.p0(i64, ptr) #1 - -; Function Attrs: nounwind memory(argmem: readwrite) -declare void @llvm.lifetime.end.p0(i64, ptr) #1 - -attributes #0 = { convergent norecurse nounwind "hlsl.export"} -attributes #1 = { nounwind memory(argmem: readwrite) } - -; DXC: - Name: SFI0 -; DXC-NEXT: Size: 8 -; DXC-NOT: Flags: -; DXC-NOT: Int64Ops: true -; DXC: ... 
diff --git a/llvm/test/CodeGen/DirectX/UAddc.ll b/llvm/test/CodeGen/DirectX/UAddc.ll index 4b46b56..dd7aa23 100644 --- a/llvm/test/CodeGen/DirectX/UAddc.ll +++ b/llvm/test/CodeGen/DirectX/UAddc.ll @@ -35,14 +35,10 @@ define noundef <2 x i32> @test_UAddc_vec2(<2 x i32> noundef %a, <2 x i32> nounde ; CHECK-NEXT: [[UADDC_I1:%.*]] = call [[DX_TYPES_I32C]] @dx.op.binaryWithCarryOrBorrow.i32(i32 44, i32 [[A_I1]], i32 [[B_I1]]) #[[ATTR0]] ; CHECK-NEXT: [[CARRY_ELEM0:%.*]] = extractvalue [[DX_TYPES_I32C]] [[UADDC_I0]], 1 ; CHECK-NEXT: [[CARRY_ELEM1:%.*]] = extractvalue [[DX_TYPES_I32C]] [[UADDC_I1]], 1 -; CHECK-NEXT: [[CARRY_UPTO0:%.*]] = insertelement <2 x i1> poison, i1 [[CARRY_ELEM0]], i64 0 -; CHECK-NEXT: [[CARRY:%.*]] = insertelement <2 x i1> [[CARRY_UPTO0]], i1 [[CARRY_ELEM1]], i64 1 -; CHECK-NEXT: [[CARRY_I0:%.*]] = extractelement <2 x i1> [[CARRY]], i64 0 -; CHECK-NEXT: [[CARRY_I1:%.*]] = extractelement <2 x i1> [[CARRY]], i64 1 ; CHECK-NEXT: [[SUM_ELEM0:%.*]] = extractvalue [[DX_TYPES_I32C]] [[UADDC_I0]], 0 ; CHECK-NEXT: [[SUM_ELEM1:%.*]] = extractvalue [[DX_TYPES_I32C]] [[UADDC_I1]], 0 -; CHECK-NEXT: [[CARRY_ZEXT_I0:%.*]] = zext i1 [[CARRY_I0]] to i32 -; CHECK-NEXT: [[CARRY_ZEXT_I1:%.*]] = zext i1 [[CARRY_I1]] to i32 +; CHECK-NEXT: [[CARRY_ZEXT_I0:%.*]] = zext i1 [[CARRY_ELEM0]] to i32 +; CHECK-NEXT: [[CARRY_ZEXT_I1:%.*]] = zext i1 [[CARRY_ELEM1]] to i32 ; CHECK-NEXT: [[RESULT_I0:%.*]] = add i32 [[SUM_ELEM0]], [[CARRY_ZEXT_I0]] ; CHECK-NEXT: [[RESULT_I1:%.*]] = add i32 [[SUM_ELEM1]], [[CARRY_ZEXT_I1]] ; CHECK-NEXT: [[RESULT_UPTO0:%.*]] = insertelement <2 x i32> poison, i32 [[RESULT_I0]], i64 0 diff --git a/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll b/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll index f77df2d..6552ccd 100644 --- a/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll +++ b/llvm/test/CodeGen/DirectX/legalize-lifetimes-valver-1.6.ll @@ -1,6 +1,5 @@ ; RUN: opt -S -passes='dxil-op-lower' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,CHECK-SM63 ; RUN: opt -S -passes='dxil-op-lower' -mtriple=dxil-pc-shadermodel6.6-library %s | FileCheck %s --check-prefixes=CHECK,CHECK-SM66 -; RUN: opt -S -dxil-op-lower -dxil-prepare -mtriple=dxil-pc-shadermodel6.6-library %s | FileCheck %s --check-prefixes=CHECK,CHECK-PREPARE ; CHECK-LABEL: define void @test_legal_lifetime() { ; @@ -16,14 +15,6 @@ ; CHECK-SM66-NEXT: store i32 0, ptr [[GEP]], align 4 ; CHECK-SM66-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[ACCUM_I_FLAT]]) ; -; CHECK-PREPARE-NEXT: [[ACCUM_I_FLAT:%.*]] = alloca [1 x i32], align 4 -; CHECK-PREPARE-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[ACCUM_I_FLAT]], i32 0 -; CHECK-PREPARE-NEXT: [[BITCAST:%.*]] = bitcast ptr [[ACCUM_I_FLAT]] to ptr -; CHECK-PREPARE-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[BITCAST]]) -; CHECK-PREPARE-NEXT: store i32 0, ptr [[GEP]], align 4 -; CHECK-PREPARE-NEXT: [[BITCAST:%.*]] = bitcast ptr [[ACCUM_I_FLAT]] to ptr -; CHECK-PREPARE-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[BITCAST]]) -; ; CHECK-NEXT: ret void ; define void @test_legal_lifetime() { @@ -35,22 +26,6 @@ define void @test_legal_lifetime() { ret void } -; CHECK-PREPARE-DAG: attributes [[LIFETIME_ATTRS:#.*]] = { nounwind } - -; CHECK-PREPARE-DAG: ; Function Attrs: nounwind -; CHECK-PREPARE-DAG: declare void @llvm.lifetime.start.p0(i64, ptr) [[LIFETIME_ATTRS]] - -; CHECK-PREPARE-DAG: ; Function Attrs: nounwind -; CHECK-PREPARE-DAG: declare void @llvm.lifetime.end.p0(i64, ptr) 
[[LIFETIME_ATTRS]] - -; Function Attrs: nounwind memory(argmem: readwrite) -declare void @llvm.lifetime.end.p0(i64, ptr) #0 - -; Function Attrs: nounwind memory(argmem: readwrite) -declare void @llvm.lifetime.start.p0(i64, ptr) #0 - -attributes #0 = { nounwind memory(argmem: readwrite) } - ; Set the validator version to 1.6 !dx.valver = !{!0} !0 = !{i32 1, i32 6} diff --git a/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir b/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir new file mode 100644 index 0000000..2960343 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir @@ -0,0 +1,50 @@ +# RUN: llc -mtriple=hexagon -run-pass pipeliner %s -o /dev/null + +# Check that edges that violate topological order are not added to the +# SwingSchedulerDAG. This is a case where the crash was caused by PR 145878. + +--- | + target triple = "hexagon" + + define void @crash_145878() { + entry: + br label %loop + + loop: ; preds = %loop, %entry + %lsr.iv2 = phi i32 [ %lsr.iv.next, %loop ], [ 1, %entry ] + %lsr.iv = phi ptr [ %cgep3, %loop ], [ inttoptr (i32 -8 to ptr), %entry ] + %cgep = getelementptr i8, ptr %lsr.iv, i32 12 + %load = load i32, ptr %cgep, align 4 + store i32 %load, ptr %lsr.iv, align 4 + %lsr.iv.next = add nsw i32 %lsr.iv2, -1 + %iv.cmp.not = icmp eq i32 %lsr.iv.next, 0 + %cgep3 = getelementptr i8, ptr %lsr.iv, i32 -8 + br i1 %iv.cmp.not, label %exit, label %loop + + exit: ; preds = %loop + ret void + } +... +--- +name: crash_145878 +tracksRegLiveness: true +body: | + bb.0.entry: + successors: %bb.1(0x80000000) + + %5:intregs = A2_tfrsi -8 + J2_loop0i %bb.1, 1, implicit-def $lc0, implicit-def $sa0, implicit-def $usr + + bb.1.loop (machine-block-address-taken): + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + + %1:intregs = PHI %5, %bb.0, %3, %bb.1 + %6:intregs = L2_loadri_io %1, 12 :: (load (s32) from %ir.cgep) + S2_storeri_io %1, 0, killed %6 :: (store (s32) into %ir.lsr.iv) + %3:intregs = A2_addi %1, -8 + ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 + J2_jump %bb.2, implicit-def dead $pc + + bb.2.exit: + PS_jmpret $r31, implicit-def dead $pc +... 
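A short gloss on the new Hexagon test above: the loop loads an i32 from 12 bytes above the induction pointer, stores an i32 at the pointer itself, and steps the pointer back by 8 bytes each trip, so the pipeliner must model the cross-iteration memory accesses without adding SwingSchedulerDAG edges that break topological order (the crash mode the test comment refers to). Below is a minimal C sketch of the same access pattern, for orientation only; the function name, parameters, and trip count are illustrative and not part of the test.

#include <string.h>

/* Sketch of the access pattern in @crash_145878 (illustrative only):
 * read an i32 from 12 bytes ahead, write it at the current pointer,
 * then walk the pointer backward by 8 bytes. These interleaved
 * addresses are what the pipeliner's dependence analysis has to
 * order across iterations. */
static void swp_pattern(char *p, int trips) {
  for (int i = trips; i != 0; --i) { /* %lsr.iv2 counts down to 0      */
    int v;
    memcpy(&v, p + 12, sizeof v);    /* %load = load i32, ptr %cgep    */
    memcpy(p, &v, sizeof v);         /* store i32 %load, ptr %lsr.iv   */
    p -= 8;                          /* %cgep3 = getelementptr ..., -8 */
  }
}
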
diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll index f25e988..61a915a 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll @@ -467,22 +467,21 @@ entry: define void @buildvector_v8f32(ptr %dst, float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind { ; CHECK-LABEL: buildvector_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movfr2gr.s $a1, $fa0 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 0 -; CHECK-NEXT: movfr2gr.s $a1, $fa1 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 1 -; CHECK-NEXT: movfr2gr.s $a1, $fa2 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 2 -; CHECK-NEXT: movfr2gr.s $a1, $fa3 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 3 -; CHECK-NEXT: movfr2gr.s $a1, $fa4 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 4 -; CHECK-NEXT: movfr2gr.s $a1, $fa5 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 5 -; CHECK-NEXT: movfr2gr.s $a1, $fa6 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 6 -; CHECK-NEXT: movfr2gr.s $a1, $fa7 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 7 +; CHECK-NEXT: # kill: def $f7 killed $f7 def $xr7 +; CHECK-NEXT: # kill: def $f6 killed $f6 def $xr6 +; CHECK-NEXT: # kill: def $f5 killed $f5 def $xr5 +; CHECK-NEXT: # kill: def $f4 killed $f4 def $xr4 +; CHECK-NEXT: # kill: def $f3 killed $f3 def $xr3 +; CHECK-NEXT: # kill: def $f2 killed $f2 def $xr2 +; CHECK-NEXT: # kill: def $f1 killed $f1 def $xr1 +; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 1 +; CHECK-NEXT: xvinsve0.w $xr0, $xr2, 2 +; CHECK-NEXT: xvinsve0.w $xr0, $xr3, 3 +; CHECK-NEXT: xvinsve0.w $xr0, $xr4, 4 +; CHECK-NEXT: xvinsve0.w $xr0, $xr5, 5 +; CHECK-NEXT: xvinsve0.w $xr0, $xr6, 6 +; CHECK-NEXT: xvinsve0.w $xr0, $xr7, 7 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -501,14 +500,13 @@ entry: define void @buildvector_v4f64(ptr %dst, double %a0, double %a1, double %a2, double %a3) nounwind { ; CHECK-LABEL: buildvector_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movfr2gr.d $a1, $fa0 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 0 -; CHECK-NEXT: movfr2gr.d $a1, $fa1 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 1 -; CHECK-NEXT: movfr2gr.d $a1, $fa2 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2 -; CHECK-NEXT: movfr2gr.d $a1, $fa3 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 3 +; CHECK-NEXT: # kill: def $f3_64 killed $f3_64 def $xr3 +; CHECK-NEXT: # kill: def $f2_64 killed $f2_64 def $xr2 +; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $xr1 +; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1 +; CHECK-NEXT: xvinsve0.d $xr0, $xr2, 2 +; CHECK-NEXT: xvinsve0.d $xr0, $xr3, 3 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll index 9528280..3800712 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll @@ -11,23 +11,22 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { ; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill ; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill ; CHECK-NEXT: addi.w $fp, $a0, 0 -; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 0 +; CHECK-NEXT: # kill: def $f0 killed $f0 def 
$xr0 ; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill ; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1 +; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 0 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 +; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 1 ; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill ; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 2 @@ -35,59 +34,60 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 2 -; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 2 +; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill ; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 3 -; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 3 +; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill ; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 4 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 4 -; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 4 +; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill ; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 5 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 5 -; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 5 +; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill ; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 6 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: movfr2gr.s 
$a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 6
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 6
+; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7
; CHECK-NEXT: movgr2fr.w $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 7
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 7
+; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 96
@@ -105,45 +105,45 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -96
; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
-; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
-; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1
+; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
-; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 2
+; CHECK-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 2
-; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill
-; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
; CHECK-NEXT: movgr2fr.d $fa0, $a0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload
-; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 3
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 3
+; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 96
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index f154dd3..221aba3 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -6,15 +6,12 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: shufflevector_v4f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 1
-; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 2
+; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 3
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2
; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3
-; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 3
-; CHECK-NEXT: xvori.b $xr0, $xr2, 0
+; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 3
; CHECK-NEXT: ret
entry:
%c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 6, i32 3, i32 7>
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
index b24f95e..c1d4220 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
@@ -87,8 +87,8 @@ define void @insert_8xfloat(ptr %src, ptr %dst, float %in) nounwind {
; CHECK-LABEL: insert_8xfloat:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr1, $a0, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: xvinsgr2vr.w $xr1, $a0, 1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0
+; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 1
; CHECK-NEXT: xvst $xr1, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <8 x float>, ptr %src
@@ -101,8 +101,8 @@ define void @insert_4xdouble(ptr %src, ptr %dst, double %in) nounwind {
; CHECK-LABEL: insert_4xdouble:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr1, $a0, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0
+; CHECK-NEXT: xvinsve0.d $xr1, $xr0, 1
; CHECK-NEXT: xvst $xr1, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <4 x double>, ptr %src
diff --git a/llvm/test/CodeGen/LoongArch/llvm.exp10.ll b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
index 7a52531..62ea5cb 100644
--- a/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
+++ b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll
@@ -196,21 +196,20 @@ define <2 x double> @exp10_v2f64(<2 x double> %x) #0 {
; LA64-NEXT: addi.d $sp, $sp, -48
; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(exp10)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
-; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(exp10)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
-; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 1
+; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.d $vr0, $vr1, 16
; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 48
; LA64-NEXT: ret
diff --git a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
index 648c19d..383d63c 100644
--- a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
+++ b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
@@ -571,39 +571,37 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) #0 {
; LA64-NEXT: addi.d $sp, $sp, -80
; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
-; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
+; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sin)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
-; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
-; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
+; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT: vreplvei.d $vr0, $vr0, 1
+; LA64-NEXT: vreplvei.d $vr0, $vr0, 0
; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(sin)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
+; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.d $vr0, $vr1, 16
+; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
-; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 1
-; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
-; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(cos)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
-; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0
-; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; LA64-NEXT: pcaddu18i $ra, %call36(cos)
; LA64-NEXT: jirl $ra, $ra, 0
-; LA64-NEXT: movfr2gr.d $a0, $fa0
-; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload
-; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1
+; LA64-NEXT: fmov.d $fa1, $fa0
; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload
+; LA64-NEXT: vextrins.d $vr1, $vr0, 16
+; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 80
; LA64-NEXT: ret
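A note on the pattern above and in the build-vector test that follows: the new LoongArch lowering keeps the scalar FP result inside a vector register and inserts it with vextrins/xvinsve0 instead of bouncing through a GPR via movfr2gr + vinsgr2vr. The vextrins immediates 16/32/48 (0x10/0x20/0x30) are consistent with bits [7:4] selecting the destination lane and bits [3:0] the source lane; here is a minimal Python sketch of that reading (an assumption about the encoding, not taken from the patch):

def vextrins(dst, src, imm):
    # imm[7:4] = destination lane, imm[3:0] = source lane (assumed layout).
    out = list(dst)
    out[(imm >> 4) & 0xF] = src[imm & 0xF]
    return out

# vextrins.d $vr0, $vr1, 16 above: lane 1 of dst <- lane 0 of src.
assert vextrins([1.0, 0.0], [2.0, 9.9], 16) == [1.0, 2.0]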
diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
index d84e408..afc87d1 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll
@@ -334,14 +334,13 @@ entry:
define void @buildvector_v4f32(ptr %dst, float %a0, float %a1, float %a2, float %a3) nounwind {
; CHECK-LABEL: buildvector_v4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movfr2gr.s $a1, $fa0
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 0
-; CHECK-NEXT: movfr2gr.s $a1, $fa1
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 1
-; CHECK-NEXT: movfr2gr.s $a1, $fa2
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 2
-; CHECK-NEXT: movfr2gr.s $a1, $fa3
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 3
+; CHECK-NEXT: # kill: def $f3 killed $f3 def $vr3
+; CHECK-NEXT: # kill: def $f2 killed $f2 def $vr2
+; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vextrins.w $vr0, $vr1, 16
+; CHECK-NEXT: vextrins.w $vr0, $vr2, 32
+; CHECK-NEXT: vextrins.w $vr0, $vr3, 48
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -356,10 +355,9 @@ entry:
define void @buildvector_v2f64(ptr %dst, double %a0, double %a1) nounwind {
; CHECK-LABEL: buildvector_v2f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movfr2gr.d $a1, $fa0
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 0
-; CHECK-NEXT: movfr2gr.d $a1, $fa1
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 1
+; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $vr1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
index aafef07..735dad4 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/fpowi.ll
@@ -9,45 +9,45 @@ define <4 x float> @powi_v4f32(<4 x float> %va, i32 %b) nounwind {
; CHECK-NEXT: addi.d $sp, $sp, -48
; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
-; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 1
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
-; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vextrins.w $vr0, $vr1, 16
+; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 1
-; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
; CHECK-NEXT: vreplvei.w $vr0, $vr0, 2
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vextrins.w $vr1, $vr0, 32
+; CHECK-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 2
-; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
-; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 3
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; CHECK-NEXT: vextrins.w $vr1, $vr0, 48
+; CHECK-NEXT: vori.b $vr0, $vr1, 0
; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 48
@@ -67,23 +67,22 @@ define <2 x double> @powi_v2f64(<2 x double> %va, i32 %b) nounwind {
; CHECK-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
; CHECK-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; CHECK-NEXT: addi.w $fp, $a0, 0
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill
; CHECK-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0
; CHECK-NEXT: move $a0, $fp
; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2)
; CHECK-NEXT: jirl $ra, $ra, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload
-; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload
+; CHECK-NEXT: vextrins.d $vr0, $vr1, 16
; CHECK-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; CHECK-NEXT: addi.d $sp, $sp, 48
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
index 7f23207..c73252b 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll
@@ -57,8 +57,8 @@ define void @insert_4xfloat(ptr %src, ptr %dst, float %ins) nounwind {
; CHECK-LABEL: insert_4xfloat:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr1, $a0, 0
-; CHECK-NEXT: movfr2gr.s $a0, $fa0
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a0, 1
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vextrins.w $vr1, $vr0, 16
; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <4 x float>, ptr %src
@@ -71,8 +71,8 @@ define void @insert_2xdouble(ptr %src, ptr %dst, double %ins) nounwind {
; CHECK-LABEL: insert_2xdouble:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr1, $a0, 0
-; CHECK-NEXT: movfr2gr.d $a0, $fa0
-; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT: vextrins.d $vr1, $vr0, 16
; CHECK-NEXT: vst $vr1, $a1, 0
; CHECK-NEXT: ret
%v = load volatile <2 x double>, ptr %src
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
index 0ee3012..ad57bbf 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
@@ -588,3 +588,18 @@ define i2 @vmsk_trunc_i64(<2 x i64> %a) {
%res = bitcast <2 x i1> %y to i2
ret i2 %res
}
+
+define i4 @vmsk_eq_allzeros_v4i8(<4 x i8> %a) {
+; CHECK-LABEL: vmsk_eq_allzeros_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vseqi.b $vr0, $vr0, 0
+; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT: vslli.w $vr0, $vr0, 24
+; CHECK-NEXT: vmskltz.w $vr0, $vr0
+; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0
+; CHECK-NEXT: ret
+  %1 = icmp eq <4 x i8> %a, zeroinitializer
+  %2 = bitcast <4 x i1> %1 to i4
+  ret i4 %2
+}
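On the new vmsk_eq_allzeros_v4i8 test: vseqi.b materializes the compare as all-ones/all-zeros bytes, the two vilvl interleaves plus vslli.w move each byte's flag into the sign bit of a 32-bit lane, and vmskltz.w then packs one sign bit per lane into the low bits of a GPR, which is the i4 the bitcast asks for. A rough Python model of that packing (helper name invented for illustration):

def vmsk_eq_allzeros_v4i8(bytes4):
    # icmp eq <4 x i8> %a, zeroinitializer, widened to 32-bit lanes
    lanes = [0xFFFFFFFF if b == 0 else 0 for b in bytes4]
    mask = 0
    for i, lane in enumerate(lanes):
        mask |= ((lane >> 31) & 1) << i  # vmskltz.w: one sign bit per lane
    return mask

assert vmsk_eq_allzeros_v4i8([0, 7, 0, 0]) == 0b1101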
diff --git a/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll b/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
index eb656ad..6e9d26a 100644
--- a/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
+++ b/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
@@ -24,9 +24,9 @@
; NO-WARNING-NOT: warning: triple-implied ABI conflicts with provided target-abi 'lp64d', using target-abi
;; Check that ILP32-on-LA64 and LP64-on-LA32 combinations are handled properly.
-; RUN: llc --mtriple=loongarch64 --target-abi=ilp32d --mattr=+d < %s 2>&1 \
+; RUN: llc --mtriple=loongarch64-linux-gnu --target-abi=ilp32d --mattr=+d < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=LP64D,32ON64
-; RUN: llc --mtriple=loongarch32 --target-abi=lp64d --mattr=+d < %s 2>&1 \
+; RUN: llc --mtriple=loongarch32-linux-gnu --target-abi=lp64d --mattr=+d < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=ILP32D,64ON32
; 32ON64: warning: 32-bit ABIs are not supported for 64-bit targets, ignoring and using triple-implied ABI
@@ -49,12 +49,6 @@
; LP64D-LP64F-NOF: warning: both target-abi and the triple-implied ABI are invalid, ignoring and using feature-implied ABI
-
-;; Check that triple-implied ABI are invalid, use feature-implied ABI
-; RUN: llc --mtriple=loongarch64 --mattr=-f < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefixes=LP64S,LP64D-NONE-NOF
-
-; LP64D-NONE-NOF: warning: the triple-implied ABI is invalid, ignoring and using feature-implied ABI
-
define float @f(float %a) {
; ILP32D-LABEL: f:
; ILP32D: # %bb.0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index b514c493..278cf01 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -46,6 +46,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
entry:
@@ -315,6 +316,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
entry:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index fc730f9..890ea44 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -46,6 +46,7 @@
; AFTER-PEI-NEXT: hasInitWholeWave: false
; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0
; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
+; AFTER-PEI-NEXT: isWholeWaveFunction: false
; AFTER-PEI-NEXT: body:
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
%wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 5adef14..f84ef8a 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -46,6 +46,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
bb0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index fa40164..cc834d0 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -46,6 +46,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {
bb0:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 24565e4..06c580e 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -55,6 +55,7 @@
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -162,6 +163,7 @@ body: |
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -240,6 +242,7 @@ body: |
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
@@ -319,6 +322,7 @@ body: |
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
+# FULL-NEXT: isWholeWaveFunction: false
# FULL-NEXT: body:
# SIMPLE: machineFunctionInfo:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index a152713..4271546 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -56,6 +56,7 @@
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
%gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
@@ -105,6 +106,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
%gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
@@ -178,6 +180,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define void @function() {
ret void
@@ -233,6 +236,7 @@ define void @function() {
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
+; CHECK-NEXT: isWholeWaveFunction: false
; CHECK-NEXT: body:
define void @function_nsz() #0 {
ret void
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index f1adc34..9a051b3 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -94,27 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
; CHECK-LABEL: test_select_i1_basic_folding(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<12>;
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .pred %p<13>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
; CHECK-NEXT: setp.eq.b32 %p1, %r1, 0;
-; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_1];
-; CHECK-NEXT: setp.ne.b32 %p2, %r3, 0;
-; CHECK-NEXT: setp.eq.b32 %p3, %r3, 0;
-; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_2];
-; CHECK-NEXT: setp.eq.b32 %p4, %r5, 0;
-; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_folding_param_3];
+; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_folding_param_1];
+; CHECK-NEXT: setp.ne.b32 %p2, %r2, 0;
+; CHECK-NEXT: setp.eq.b32 %p3, %r2, 0;
+; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
+; CHECK-NEXT: setp.eq.b32 %p4, %r3, 0;
+; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
; CHECK-NEXT: xor.pred %p6, %p1, %p3;
-; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_folding_param_4];
+; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
; CHECK-NEXT: and.pred %p7, %p6, %p4;
-; CHECK-NEXT: and.pred %p8, %p2, %p4;
-; CHECK-NEXT: and.pred %p9, %p3, %p7;
-; CHECK-NEXT: or.pred %p10, %p9, %p8;
-; CHECK-NEXT: xor.pred %p11, %p10, %p3;
-; CHECK-NEXT: selp.b32 %r8, %r6, %r7, %p11;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: and.pred %p9, %p2, %p4;
+; CHECK-NEXT: and.pred %p10, %p3, %p7;
+; CHECK-NEXT: or.pred %p11, %p10, %p9;
+; CHECK-NEXT: xor.pred %p12, %p11, %p3;
+; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NEXT: ret;
%b1 = icmp eq i32 %v1, 0
%b2 = icmp eq i32 %v2, 0
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index f2211eb..44d8558 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -5,9 +5,9 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: srem_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<22>;
+; CHECK-NEXT: .reg .pred %p<20>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<126>;
+; CHECK-NEXT: .reg .b64 %rd<127>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0];
@@ -42,103 +42,102 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd62, %r4;
; CHECK-NEXT: add.s64 %rd63, %rd62, 64;
; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7;
-; CHECK-NEXT: mov.b64 %rd116, 0;
+; CHECK-NEXT: mov.b64 %rd117, 0;
; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64;
-; CHECK-NEXT: subc.cc.s64 %rd8, %rd116, 0;
-; CHECK-NEXT: setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT: and.pred %p10, %p8, %p8;
-; CHECK-NEXT: setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT: setp.gt.u64 %p12, %rd66, 127;
-; CHECK-NEXT: and.pred %p13, %p11, %p12;
-; CHECK-NEXT: or.pred %p14, %p13, %p10;
-; CHECK-NEXT: or.pred %p15, %p5, %p14;
-; CHECK-NEXT: xor.b64 %rd67, %rd66, 127;
-; CHECK-NEXT: or.b64 %rd68, %rd67, %rd8;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd68, 0;
-; CHECK-NEXT: selp.b64 %rd125, 0, %rd4, %p15;
-; CHECK-NEXT: selp.b64 %rd124, 0, %rd3, %p15;
-; CHECK-NEXT: or.pred %p17, %p15, %p16;
-; CHECK-NEXT: @%p17 bra $L__BB0_5;
+; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127;
+; CHECK-NEXT: setp.eq.b64 %p9, %rd67, 0;
+; CHECK-NEXT: and.pred %p10, %p9, %p8;
+; CHECK-NEXT: setp.ne.b64 %p11, %rd67, 0;
+; CHECK-NEXT: or.pred %p12, %p10, %p11;
+; CHECK-NEXT: or.pred %p13, %p5, %p12;
+; CHECK-NEXT: xor.b64 %rd68, %rd66, 127;
+; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd69, 0;
+; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p13;
+; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p13;
+; CHECK-NEXT: or.pred %p15, %p13, %p14;
+; CHECK-NEXT: @%p15 bra $L__BB0_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd118, %rd66, 1;
-; CHECK-NEXT: addc.cc.s64 %rd119, %rd8, 0;
-; CHECK-NEXT: or.b64 %rd71, %rd118, %rd119;
-; CHECK-NEXT: setp.eq.b64 %p18, %rd71, 0;
+; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1;
+; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0;
+; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120;
+; CHECK-NEXT: setp.eq.b64 %p16, %rd72, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd66;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd72, %rd4, %r6;
+; CHECK-NEXT: shl.b64 %rd73, %rd4, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd73, %rd3, %r7;
-; CHECK-NEXT: or.b64 %rd74, %rd72, %rd73;
+; CHECK-NEXT: shr.u64 %rd74, %rd3, %r7;
+; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd75, %rd3, %r8;
-; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd123, %rd75, %rd74, %p19;
-; CHECK-NEXT: shl.b64 %rd122, %rd3, %r6;
-; CHECK-NEXT: mov.b64 %rd113, %rd116;
-; CHECK-NEXT: @%p18 bra $L__BB0_4;
+; CHECK-NEXT: shl.b64 %rd76, %rd3, %r8;
+; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p17;
+; CHECK-NEXT: shl.b64 %rd123, %rd3, %r6;
+; CHECK-NEXT: mov.b64 %rd114, %rd117;
+; CHECK-NEXT: @%p16 bra $L__BB0_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd118;
-; CHECK-NEXT: shr.u64 %rd78, %rd3, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd119;
+; CHECK-NEXT: shr.u64 %rd79, %rd3, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd79, %rd4, %r10;
-; CHECK-NEXT: or.b64 %rd80, %rd78, %rd79;
+; CHECK-NEXT: shl.b64 %rd80, %rd4, %r10;
+; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd81, %rd4, %r11;
-; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd120, %rd81, %rd80, %p20;
-; CHECK-NEXT: shr.u64 %rd121, %rd4, %r9;
+; CHECK-NEXT: shr.u64 %rd82, %rd4, %r11;
+; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p18;
+; CHECK-NEXT: shr.u64 %rd122, %rd4, %r9;
; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1;
; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1;
-; CHECK-NEXT: mov.b64 %rd113, 0;
-; CHECK-NEXT: mov.b64 %rd116, %rd113;
+; CHECK-NEXT: mov.b64 %rd114, 0;
+; CHECK-NEXT: mov.b64 %rd117, %rd114;
; CHECK-NEXT: $L__BB0_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd82, %rd120, 63;
-; CHECK-NEXT: shl.b64 %rd83, %rd121, 1;
-; CHECK-NEXT: or.b64 %rd84, %rd83, %rd82;
-; CHECK-NEXT: shl.b64 %rd85, %rd120, 1;
-; CHECK-NEXT: shr.u64 %rd86, %rd123, 63;
-; CHECK-NEXT: or.b64 %rd87, %rd85, %rd86;
-; CHECK-NEXT: shr.u64 %rd88, %rd122, 63;
-; CHECK-NEXT: shl.b64 %rd89, %rd123, 1;
-; CHECK-NEXT: or.b64 %rd90, %rd89, %rd88;
-; CHECK-NEXT: shl.b64 %rd91, %rd122, 1;
-; CHECK-NEXT: or.b64 %rd122, %rd116, %rd91;
-; CHECK-NEXT: or.b64 %rd123, %rd113, %rd90;
-; CHECK-NEXT: sub.cc.s64 %rd92, %rd35, %rd87;
-; CHECK-NEXT: subc.cc.s64 %rd93, %rd36, %rd84;
-; CHECK-NEXT: shr.s64 %rd94, %rd93, 63;
-; CHECK-NEXT: and.b64 %rd116, %rd94, 1;
-; CHECK-NEXT: and.b64 %rd95, %rd94, %rd5;
-; CHECK-NEXT: and.b64 %rd96, %rd94, %rd6;
-; CHECK-NEXT: sub.cc.s64 %rd120, %rd87, %rd95;
-; CHECK-NEXT: subc.cc.s64 %rd121, %rd84, %rd96;
-; CHECK-NEXT: add.cc.s64 %rd118, %rd118, -1;
-; CHECK-NEXT: addc.cc.s64 %rd119, %rd119, -1;
-; CHECK-NEXT: or.b64 %rd97, %rd118, %rd119;
-; CHECK-NEXT: setp.eq.b64 %p21, %rd97, 0;
-; CHECK-NEXT: @%p21 bra $L__BB0_4;
+; CHECK-NEXT: shr.u64 %rd83, %rd121, 63;
+; CHECK-NEXT: shl.b64 %rd84, %rd122, 1;
+; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83;
+; CHECK-NEXT: shl.b64 %rd86, %rd121, 1;
+; CHECK-NEXT: shr.u64 %rd87, %rd124, 63;
+; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87;
+; CHECK-NEXT: shr.u64 %rd89, %rd123, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd124, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd123, 1;
+; CHECK-NEXT: or.b64 %rd123, %rd117, %rd92;
+; CHECK-NEXT: or.b64 %rd124, %rd114, %rd91;
+; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88;
+; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85;
+; CHECK-NEXT: shr.s64 %rd95, %rd94, 63;
+; CHECK-NEXT: and.b64 %rd117, %rd95, 1;
+; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5;
+; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6;
+; CHECK-NEXT: sub.cc.s64 %rd121, %rd88, %rd96;
+; CHECK-NEXT: subc.cc.s64 %rd122, %rd85, %rd97;
+; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1;
+; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1;
+; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120;
+; CHECK-NEXT: setp.eq.b64 %p19, %rd98, 0;
+; CHECK-NEXT: @%p19 bra $L__BB0_4;
; CHECK-NEXT: bra.uni $L__BB0_2;
; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd98, %rd122, 63;
-; CHECK-NEXT: shl.b64 %rd99, %rd123, 1;
-; CHECK-NEXT: or.b64 %rd100, %rd99, %rd98;
-; CHECK-NEXT: shl.b64 %rd101, %rd122, 1;
-; CHECK-NEXT: or.b64 %rd124, %rd116, %rd101;
-; CHECK-NEXT: or.b64 %rd125, %rd113, %rd100;
+; CHECK-NEXT: shr.u64 %rd99, %rd123, 63;
+; CHECK-NEXT: shl.b64 %rd100, %rd124, 1;
+; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99;
+; CHECK-NEXT: shl.b64 %rd102, %rd123, 1;
+; CHECK-NEXT: or.b64 %rd125, %rd117, %rd102;
+; CHECK-NEXT: or.b64 %rd126, %rd114, %rd101;
; CHECK-NEXT: $L__BB0_5: // %udiv-end
-; CHECK-NEXT: mul.hi.u64 %rd102, %rd5, %rd124;
-; CHECK-NEXT: mad.lo.s64 %rd103, %rd5, %rd125, %rd102;
-; CHECK-NEXT: mad.lo.s64 %rd104, %rd6, %rd124, %rd103;
-; CHECK-NEXT: mul.lo.s64 %rd105, %rd5, %rd124;
-; CHECK-NEXT: sub.cc.s64 %rd106, %rd3, %rd105;
-; CHECK-NEXT: subc.cc.s64 %rd107, %rd4, %rd104;
-; CHECK-NEXT: xor.b64 %rd108, %rd106, %rd2;
; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2;
-; CHECK-NEXT: sub.cc.s64 %rd110, %rd108, %rd2;
-; CHECK-NEXT: subc.cc.s64 %rd111, %rd109, %rd2;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd110, %rd111};
+; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd125;
+; CHECK-NEXT: mad.lo.s64 %rd104, %rd5, %rd126, %rd103;
+; CHECK-NEXT: mad.lo.s64 %rd105, %rd6, %rd125, %rd104;
+; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125;
+; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106;
+; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105;
; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2;
+; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2;
+; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2;
+; CHECK-NEXT: subc.cc.s64 %rd112, %rd110, %rd2;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd111, %rd112};
; CHECK-NEXT: ret;
%div = srem i128 %lhs, %rhs
ret i128 %div
@@ -149,7 +148,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<18>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<111>;
+; CHECK-NEXT: .reg .b64 %rd<113>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0];
@@ -173,98 +172,98 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd52, %r4;
; CHECK-NEXT: add.s64 %rd53, %rd52, 64;
; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT: mov.b64 %rd101, 0;
-; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT: subc.cc.s64 %rd6, %rd101, 0;
-; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT: setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT: mov.b64 %rd103, 0;
+; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0;
; CHECK-NEXT: and.pred %p8, %p7, %p6;
-; CHECK-NEXT: setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0;
; CHECK-NEXT: or.pred %p10, %p8, %p9;
; CHECK-NEXT: or.pred %p11, %p3, %p10;
-; CHECK-NEXT: xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT: setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT: selp.b64 %rd110, 0, %rd42, %p11;
-; CHECK-NEXT: selp.b64 %rd109, 0, %rd41, %p11;
+; CHECK-NEXT: xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p11;
+; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p11;
; CHECK-NEXT: or.pred %p13, %p11, %p12;
; CHECK-NEXT: @%p13 bra $L__BB1_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd103, %rd5, 1;
-; CHECK-NEXT: addc.cc.s64 %rd104, %rd6, 0;
-; CHECK-NEXT: or.b64 %rd60, %rd103, %rd104;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1;
+; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0;
+; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd56;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8;
; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd108, %rd64, %rd63, %p15;
-; CHECK-NEXT: shl.b64 %rd107, %rd41, %r6;
-; CHECK-NEXT: mov.b64 %rd98, %rd101;
+; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p15;
+; CHECK-NEXT: shl.b64 %rd109, %rd41, %r6;
+; CHECK-NEXT: mov.b64 %rd100, %rd103;
; CHECK-NEXT: @%p14 bra $L__BB1_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd103;
-; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd105;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11;
; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd105, %rd70, %rd69, %p16;
-; CHECK-NEXT: shr.u64 %rd106, %rd42, %r9;
+; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p16;
+; CHECK-NEXT: shr.u64 %rd108, %rd42, %r9;
; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1;
-; CHECK-NEXT: mov.b64 %rd98, 0;
-; CHECK-NEXT: mov.b64 %rd101, %rd98;
+; CHECK-NEXT: mov.b64 %rd100, 0;
+; CHECK-NEXT: mov.b64 %rd103, %rd100;
; CHECK-NEXT: $L__BB1_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd71, %rd105, 63;
-; CHECK-NEXT: shl.b64 %rd72, %rd106, 1;
-; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT: shl.b64 %rd74, %rd105, 1;
-; CHECK-NEXT: shr.u64 %rd75, %rd108, 63;
-; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT: shr.u64 %rd77, %rd107, 63;
-; CHECK-NEXT: shl.b64 %rd78, %rd108, 1;
-; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT: shl.b64 %rd80, %rd107, 1;
-; CHECK-NEXT: or.b64 %rd107, %rd101, %rd80;
-; CHECK-NEXT: or.b64 %rd108, %rd98, %rd79;
-; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT: shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT: and.b64 %rd101, %rd83, 1;
-; CHECK-NEXT: and.b64 %rd84, %rd83, %rd3;
-; CHECK-NEXT: and.b64 %rd85, %rd83, %rd4;
-; CHECK-NEXT: sub.cc.s64 %rd105, %rd76, %rd84;
-; CHECK-NEXT: subc.cc.s64 %rd106, %rd73, %rd85;
-; CHECK-NEXT: add.cc.s64 %rd103, %rd103, -1;
-; CHECK-NEXT: addc.cc.s64 %rd104, %rd104, -1;
-; CHECK-NEXT: or.b64 %rd86, %rd103, %rd104;
-; CHECK-NEXT: setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT: shr.u64 %rd73, %rd107, 63;
+; CHECK-NEXT: shl.b64 %rd74, %rd108, 1;
+; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT: shl.b64 %rd76, %rd107, 1;
+; CHECK-NEXT: shr.u64 %rd77, %rd110, 63;
+; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT: shr.u64 %rd79, %rd109, 63;
+; CHECK-NEXT: shl.b64 %rd80, %rd110, 1;
+; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT: shl.b64 %rd82, %rd109, 1;
+; CHECK-NEXT: or.b64 %rd109, %rd103, %rd82;
+; CHECK-NEXT: or.b64 %rd110, %rd100, %rd81;
+; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT: shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT: and.b64 %rd103, %rd85, 1;
+; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3;
+; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4;
+; CHECK-NEXT: sub.cc.s64 %rd107, %rd78, %rd86;
+; CHECK-NEXT: subc.cc.s64 %rd108, %rd75, %rd87;
+; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1;
+; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1;
+; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0;
; CHECK-NEXT: @%p17 bra $L__BB1_4;
; CHECK-NEXT: bra.uni $L__BB1_2;
; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd87, %rd107, 63;
-; CHECK-NEXT: shl.b64 %rd88, %rd108, 1;
-; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT: shl.b64 %rd90, %rd107, 1;
-; CHECK-NEXT: or.b64 %rd109, %rd101, %rd90;
-; CHECK-NEXT: or.b64 %rd110, %rd98, %rd89;
+; CHECK-NEXT: shr.u64 %rd89, %rd109, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd110, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd109, 1;
+; CHECK-NEXT: or.b64 %rd111, %rd103, %rd92;
+; CHECK-NEXT: or.b64 %rd112, %rd100, %rd91;
; CHECK-NEXT: $L__BB1_5: // %udiv-end
-; CHECK-NEXT: mul.hi.u64 %rd91, %rd3, %rd109;
-; CHECK-NEXT: mad.lo.s64 %rd92, %rd3, %rd110, %rd91;
-; CHECK-NEXT: mad.lo.s64 %rd93, %rd4, %rd109, %rd92;
-; CHECK-NEXT: mul.lo.s64 %rd94, %rd3, %rd109;
-; CHECK-NEXT: sub.cc.s64 %rd95, %rd41, %rd94;
-; CHECK-NEXT: subc.cc.s64 %rd96, %rd42, %rd93;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd95, %rd96};
+; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd111;
+; CHECK-NEXT: mad.lo.s64 %rd94, %rd3, %rd112, %rd93;
+; CHECK-NEXT: mad.lo.s64 %rd95, %rd4, %rd111, %rd94;
+; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111;
+; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96;
+; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd97, %rd98};
; CHECK-NEXT: ret;
%div = urem i128 %lhs, %rhs
ret i128 %div
@@ -307,9 +306,9 @@ define i128 @urem_i128_pow2k(i128 %lhs) {
define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: sdiv_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<22>;
+; CHECK-NEXT: .reg .pred %p<20>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<121>;
+; CHECK-NEXT: .reg .b64 %rd<122>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0];
@@ -345,97 +344,96 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd63, %r4;
; CHECK-NEXT: add.s64 %rd64, %rd63, 64;
; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT: mov.b64 %rd111, 0;
+; CHECK-NEXT: mov.b64 %rd112, 0;
; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65;
-; CHECK-NEXT: subc.cc.s64 %rd8, %rd111, 0;
-; CHECK-NEXT: setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT: and.pred %p10, %p8, %p8;
-; CHECK-NEXT: setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT: setp.gt.u64 %p12, %rd67, 127;
-; CHECK-NEXT: and.pred %p13, %p11, %p12;
-; CHECK-NEXT: or.pred %p14, %p13, %p10;
-; CHECK-NEXT: or.pred %p15, %p5, %p14;
-; CHECK-NEXT: xor.b64 %rd68, %rd67, 127;
-; CHECK-NEXT: or.b64 %rd69, %rd68, %rd8;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd69, 0;
-; CHECK-NEXT: selp.b64 %rd120, 0, %rd2, %p15;
-; CHECK-NEXT: selp.b64 %rd119, 0, %rd1, %p15;
-; CHECK-NEXT: or.pred %p17, %p15, %p16;
-; CHECK-NEXT: @%p17 bra $L__BB4_5;
+; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127;
+; CHECK-NEXT: setp.eq.b64 %p9, %rd68, 0;
+; CHECK-NEXT: and.pred %p10, %p9, %p8;
+; CHECK-NEXT: setp.ne.b64 %p11, %rd68, 0;
+; CHECK-NEXT: or.pred %p12, %p10, %p11;
+; CHECK-NEXT: or.pred %p13, %p5, %p12;
+; CHECK-NEXT: xor.b64 %rd69, %rd67, 127;
+; CHECK-NEXT: or.b64 %rd70, %rd69, %rd68;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd70, 0;
+; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p13;
+; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p13;
+; CHECK-NEXT: or.pred %p15, %p13, %p14;
+; CHECK-NEXT: @%p15 bra $L__BB4_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd113, %rd67, 1;
-; CHECK-NEXT: addc.cc.s64 %rd114, %rd8, 0;
-; CHECK-NEXT: or.b64 %rd72, %rd113, %rd114;
-; CHECK-NEXT: setp.eq.b64 %p18, %rd72, 0;
+; CHECK-NEXT: add.cc.s64 %rd114, %rd67, 1;
+; CHECK-NEXT: addc.cc.s64 %rd115, %rd68, 0;
+; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115;
+; CHECK-NEXT: setp.eq.b64 %p16, %rd73, 0;
; CHECK-NEXT: cvt.u32.u64 %r5, %rd67;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd73, %rd2, %r6;
+; CHECK-NEXT: shl.b64 %rd74, %rd2, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd74, %rd1, %r7;
-; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74;
+; CHECK-NEXT: shr.u64 %rd75, %rd1, %r7;
+; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd76, %rd1, %r8;
-; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd118, %rd76, %rd75, %p19;
-; CHECK-NEXT: shl.b64 %rd117, %rd1, %r6;
-; CHECK-NEXT: mov.b64 %rd108, %rd111;
-; CHECK-NEXT: @%p18 bra $L__BB4_4;
+; CHECK-NEXT: shl.b64 %rd77, %rd1, %r8;
+; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p17;
+; CHECK-NEXT: shl.b64 %rd118, %rd1, %r6;
+; CHECK-NEXT: mov.b64 %rd109, %rd112;
+; CHECK-NEXT: @%p16 bra $L__BB4_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd113;
-; CHECK-NEXT: shr.u64 %rd79, %rd1, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd114;
+; CHECK-NEXT: shr.u64 %rd80, %rd1, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd80, %rd2, %r10;
-; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80;
+; CHECK-NEXT: shl.b64 %rd81, %rd2, %r10;
+; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd82, %rd2, %r11;
-; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd115, %rd82, %rd81, %p20;
-; CHECK-NEXT: shr.u64 %rd116, %rd2, %r9;
+; CHECK-NEXT: shr.u64 %rd83, %rd2, %r11;
+; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p18;
+; CHECK-NEXT: shr.u64 %rd117, %rd2, %r9;
; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1;
-; CHECK-NEXT: mov.b64 %rd108, 0;
-; CHECK-NEXT: mov.b64 %rd111, %rd108;
+; CHECK-NEXT: mov.b64 %rd109, 0;
+; CHECK-NEXT: mov.b64 %rd112, %rd109;
; CHECK-NEXT: $L__BB4_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd83, %rd115, 63;
-; CHECK-NEXT: shl.b64 %rd84, %rd116, 1;
-; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83;
-; CHECK-NEXT: shl.b64 %rd86, %rd115, 1;
-; CHECK-NEXT: shr.u64 %rd87, %rd118, 63;
-; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87;
-; CHECK-NEXT: shr.u64 %rd89, %rd117, 63;
-; CHECK-NEXT: shl.b64 %rd90, %rd118, 1;
-; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT: shl.b64 %rd92, %rd117, 1;
-; CHECK-NEXT: or.b64 %rd117, %rd111, %rd92;
-; CHECK-NEXT: or.b64 %rd118, %rd108, %rd91;
-; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88;
-; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85;
-; CHECK-NEXT: shr.s64 %rd95, %rd94, 63;
-; CHECK-NEXT: and.b64 %rd111, %rd95, 1;
-; CHECK-NEXT: and.b64 %rd96, %rd95, %rd3;
-; CHECK-NEXT: and.b64 %rd97, %rd95, %rd4;
-; CHECK-NEXT: sub.cc.s64 %rd115, %rd88, %rd96;
-; CHECK-NEXT: subc.cc.s64 %rd116, %rd85, %rd97;
-; CHECK-NEXT: add.cc.s64 %rd113, %rd113, -1;
-; CHECK-NEXT: addc.cc.s64 %rd114, %rd114, -1;
-; CHECK-NEXT: or.b64 %rd98, %rd113, %rd114;
-; CHECK-NEXT: setp.eq.b64 %p21, %rd98, 0;
-; CHECK-NEXT: @%p21 bra $L__BB4_4;
+; CHECK-NEXT: shr.u64 %rd84, %rd116, 63;
+; CHECK-NEXT: shl.b64 %rd85, %rd117, 1;
+; CHECK-NEXT: or.b64 %rd86, %rd85, %rd84;
+; CHECK-NEXT: shl.b64 %rd87, %rd116, 1;
+; CHECK-NEXT: shr.u64 %rd88, %rd119, 63;
+; CHECK-NEXT: or.b64 %rd89, %rd87, %rd88;
+; CHECK-NEXT: shr.u64 %rd90, %rd118, 63;
+; CHECK-NEXT: shl.b64 %rd91, %rd119, 1;
+; CHECK-NEXT: or.b64 %rd92, %rd91, %rd90;
+; CHECK-NEXT: shl.b64 %rd93, %rd118, 1;
+; CHECK-NEXT: or.b64 %rd118, %rd112, %rd93;
+; CHECK-NEXT: or.b64 %rd119, %rd109, %rd92;
+; CHECK-NEXT: sub.cc.s64 %rd94, %rd35, %rd89;
+; CHECK-NEXT: subc.cc.s64 %rd95, %rd36, %rd86;
+; CHECK-NEXT: shr.s64 %rd96, %rd95, 63;
+; CHECK-NEXT: and.b64 %rd112, %rd96, 1;
+; CHECK-NEXT: and.b64 %rd97, %rd96, %rd3;
+; CHECK-NEXT: and.b64 %rd98, %rd96, %rd4;
+; CHECK-NEXT: sub.cc.s64 %rd116, %rd89, %rd97;
+; CHECK-NEXT: subc.cc.s64 %rd117, %rd86, %rd98;
+; CHECK-NEXT: add.cc.s64 %rd114, %rd114, -1;
+; CHECK-NEXT: addc.cc.s64 %rd115, %rd115, -1;
+; CHECK-NEXT: or.b64 %rd99, %rd114, %rd115;
+; CHECK-NEXT: setp.eq.b64 %p19, %rd99, 0;
+; CHECK-NEXT: @%p19 bra $L__BB4_4;
; CHECK-NEXT: bra.uni $L__BB4_2;
; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd99, %rd117, 63;
-; CHECK-NEXT: shl.b64 %rd100, %rd118, 1;
-; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99;
-; CHECK-NEXT: shl.b64 %rd102, %rd117, 1;
-; CHECK-NEXT: or.b64 %rd119, %rd111, %rd102;
-; CHECK-NEXT: or.b64 %rd120, %rd108, %rd101;
+; CHECK-NEXT: shr.u64 %rd100, %rd118, 63;
+; CHECK-NEXT: shl.b64 %rd101, %rd119, 1;
+; CHECK-NEXT: or.b64 %rd102, %rd101, %rd100;
+; CHECK-NEXT: shl.b64 %rd103, %rd118, 1;
+; CHECK-NEXT: or.b64 %rd120, %rd112, %rd103;
+; CHECK-NEXT: or.b64 %rd121, %rd109, %rd102;
; CHECK-NEXT: $L__BB4_5: // %udiv-end
-; CHECK-NEXT: xor.b64 %rd103, %rd119, %rd5;
; CHECK-NEXT: xor.b64 %rd104, %rd120, %rd5;
-; CHECK-NEXT: sub.cc.s64 %rd105, %rd103, %rd5;
-; CHECK-NEXT: subc.cc.s64 %rd106, %rd104, %rd5;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106};
+; CHECK-NEXT: xor.b64 %rd105, %rd121, %rd5;
+; CHECK-NEXT: sub.cc.s64 %rd106, %rd104, %rd5;
+; CHECK-NEXT: subc.cc.s64 %rd107, %rd105, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd106, %rd107};
; CHECK-NEXT: ret;
%div = sdiv i128 %lhs, %rhs
ret i128 %div
@@ -446,7 +444,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<18>;
; CHECK-NEXT: .reg .b32 %r<12>;
-; CHECK-NEXT: .reg .b64 %rd<105>;
+; CHECK-NEXT: .reg .b64 %rd<107>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0];
@@ -470,92 +468,92 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd52, %r4;
; CHECK-NEXT: add.s64 %rd53, %rd52, 64;
; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT: mov.b64 %rd95, 0;
-; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT: subc.cc.s64 %rd6, %rd95, 0;
-; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT: setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT: mov.b64 %rd97, 0;
+; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT: setp.eq.b64 %p7, %rd57, 0;
; CHECK-NEXT: and.pred %p8, %p7, %p6;
-; CHECK-NEXT: setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT: setp.ne.b64 %p9, %rd57, 0;
; CHECK-NEXT: or.pred %p10, %p8, %p9;
; CHECK-NEXT: or.pred %p11, %p3, %p10;
-; CHECK-NEXT: xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT: setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT: selp.b64 %rd104, 0, %rd42, %p11;
-; CHECK-NEXT: selp.b64 %rd103, 0, %rd41, %p11;
+; CHECK-NEXT: xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT: setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p11;
+; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p11;
; CHECK-NEXT: or.pred %p13, %p11, %p12;
; CHECK-NEXT: @%p13 bra $L__BB5_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd97, %rd5, 1;
-; CHECK-NEXT: addc.cc.s64 %rd98, %rd6, 0;
-; CHECK-NEXT: or.b64 %rd60, %rd97, %rd98;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT: add.cc.s64 %rd99, %rd56, 1;
+; CHECK-NEXT: addc.cc.s64 %rd100, %rd57, 0;
+; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd56;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8;
; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd102, %rd64, %rd63, %p15;
-; CHECK-NEXT: shl.b64 %rd101, %rd41, %r6;
-; CHECK-NEXT: mov.b64 %rd92, %rd95;
+; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p15;
+; CHECK-NEXT: shl.b64 %rd103, %rd41, %r6;
+; CHECK-NEXT: mov.b64 %rd94, %rd97;
; CHECK-NEXT: @%p14 bra $L__BB5_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd97;
-; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd99;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11;
; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd99, %rd70, %rd69, %p16;
-; CHECK-NEXT: shr.u64 %rd100, %rd42, %r9;
+; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p16;
+; CHECK-NEXT: shr.u64 %rd102, %rd42, %r9;
; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1;
; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1;
-; CHECK-NEXT: mov.b64 %rd92, 0;
-; CHECK-NEXT: mov.b64 %rd95, %rd92;
+; CHECK-NEXT: mov.b64 %rd94, 0;
+; CHECK-NEXT: mov.b64 %rd97, %rd94;
; CHECK-NEXT: $L__BB5_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd71, %rd99, 63;
-; CHECK-NEXT: shl.b64 %rd72, %rd100, 1;
-; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT: shl.b64 %rd74, %rd99, 1;
-; CHECK-NEXT: shr.u64 %rd75, %rd102, 63;
-; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT: shr.u64 %rd77, %rd101, 63;
-; CHECK-NEXT: shl.b64 %rd78, %rd102, 1;
-; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT: shl.b64 %rd80, %rd101, 1;
-; CHECK-NEXT: or.b64 %rd101, %rd95, %rd80;
-; CHECK-NEXT: or.b64 %rd102, %rd92, %rd79;
-; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT: shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT: and.b64 %rd95, %rd83, 1;
-; CHECK-NEXT: and.b64 %rd84, %rd83, %rd43;
-; CHECK-NEXT: and.b64 %rd85, %rd83, %rd44;
-; CHECK-NEXT: sub.cc.s64 %rd99, %rd76, %rd84;
-; CHECK-NEXT: subc.cc.s64 %rd100, %rd73, %rd85;
-; CHECK-NEXT: add.cc.s64 %rd97, %rd97, -1;
-; CHECK-NEXT: addc.cc.s64 %rd98, %rd98, -1;
-; CHECK-NEXT: or.b64 %rd86, %rd97, %rd98;
-; CHECK-NEXT: setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT: shr.u64 %rd73, %rd101, 63;
+; CHECK-NEXT: shl.b64 %rd74, %rd102, 1;
+; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT: shl.b64 %rd76, %rd101, 1;
+; CHECK-NEXT: shr.u64 %rd77, %rd104, 63;
+; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT: shr.u64 %rd79, %rd103, 63;
+; CHECK-NEXT: shl.b64 %rd80, %rd104, 1;
+; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT: shl.b64 %rd82, %rd103, 1;
+; CHECK-NEXT: or.b64 %rd103, %rd97, %rd82;
+; CHECK-NEXT: or.b64 %rd104, %rd94, %rd81;
+; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT: shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT: and.b64 %rd97, %rd85, 1;
+; CHECK-NEXT: and.b64 %rd86, %rd85, %rd43;
+; CHECK-NEXT: and.b64 %rd87, %rd85, %rd44;
+; CHECK-NEXT: sub.cc.s64 %rd101, %rd78, %rd86;
+; CHECK-NEXT: subc.cc.s64 %rd102, %rd75, %rd87;
+; CHECK-NEXT: add.cc.s64 %rd99, %rd99, -1;
+; CHECK-NEXT: addc.cc.s64 %rd100, %rd100, -1;
+; CHECK-NEXT: or.b64 %rd88, %rd99, %rd100;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd88, 0;
; CHECK-NEXT: @%p17 bra $L__BB5_4;
; CHECK-NEXT: bra.uni $L__BB5_2;
; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd87, %rd101, 63;
-; CHECK-NEXT: shl.b64 %rd88, %rd102, 1;
-; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT: shl.b64 %rd90, %rd101, 1;
-; CHECK-NEXT: or.b64 %rd103, %rd95, %rd90;
-; CHECK-NEXT: or.b64 %rd104, %rd92, %rd89;
+; CHECK-NEXT: shr.u64 %rd89, %rd103, 63;
+; CHECK-NEXT: shl.b64 %rd90, %rd104, 1;
+; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT: shl.b64 %rd92, %rd103, 1;
+; CHECK-NEXT: or.b64 %rd105, %rd97, %rd92;
+; CHECK-NEXT: or.b64 %rd106, %rd94, %rd91;
; CHECK-NEXT: $L__BB5_5: // %udiv-end
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd103, %rd104};
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106};
; CHECK-NEXT: ret;
%div = udiv i128 %lhs, %rhs
ret i128 %div
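The %udiv-do-while loop shared by all four kernels above is bit-at-a-time restoring division: each iteration shifts the remainder/quotient pair left by one, tests the divisor via the borrow of a wide subtraction, and shifts the resulting bit into the quotient; srem/sdiv only add sign fixups around the same loop, and the clz arithmetic in %_udiv-special-cases pre-skips leading zero bits. A minimal Python sketch of the core arithmetic (illustrative only, not the exact register dance above):

def udivmod128(n, d):
    # Restoring division, one quotient bit per step, as in %udiv-do-while.
    assert d != 0
    q = r = 0
    for i in range(127, -1, -1):
        r = (r << 1) | ((n >> i) & 1)
        if r >= d:          # the sub.cc/subc.cc + sign-mask test
            r -= d
            q |= 1 << i
    return q, r

q, r = udivmod128(2**127 + 5, 7)
assert q == (2**127 + 5) // 7 and r == (2**127 + 5) % 7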
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py
new file mode 100644
index 0000000..8f50206
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx78-sm90.py
@@ -0,0 +1,14 @@
+# Check all variants of instructions supported by PTX78 on SM90
+# RUN: %python %s --ptx=78 --gpu-arch=90 --aa > %t-ptx78-sm_90.ll
+# RUN: FileCheck %t-ptx78-sm_90.ll < %t-ptx78-sm_90.ll \
+# RUN: --check-prefixes=PTX78STMATRIX-DAG
+# RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \
+# RUN: | FileCheck %t-ptx78-sm_90.ll
+# RUN: %if ptxas-12.7 %{ \
+# RUN: llc < %t-ptx78-sm_90.ll -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 \
+# RUN: | %ptxas-verify -arch=sm_90 \
+# RUN: %}
+
+import wmma
+
+wmma.main()
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py
index 6ad0a2a..5c14a54 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm100a.py
@@ -1,9 +1,7 @@
# Check all variants of instructions supported by PTX86 on SM100a
# RUN: %python %s --ptx=86 --gpu-arch=100 --aa > %t-ptx86-sm_100a.ll
# RUN: FileCheck %t-ptx86-sm_100a.ll < %t-ptx86-sm_100a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
-# RUN: FileCheck %t-ptx86-sm_100a.ll < %t-ptx86-sm_100a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
+# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG
# RUN: llc < %t-ptx86-sm_100a.ll -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 \
# RUN: | FileCheck %t-ptx86-sm_100a.ll
# RUN: %if ptxas-12.7 %{ \
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py
index 7d99534..a77f9ad 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm101a.py
@@ -1,9 +1,7 @@
# Check all variants of instructions supported by PTX86 on SM101a
# RUN: %python %s --ptx=86 --gpu-arch=101 --aa > %t-ptx86-sm_101a.ll
# RUN: FileCheck %t-ptx86-sm_101a.ll < %t-ptx86-sm_101a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
-# RUN: FileCheck %t-ptx86-sm_101a.ll < %t-ptx86-sm_101a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
+# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG
# RUN: llc < %t-ptx86-sm_101a.ll -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 \
# RUN: | FileCheck %t-ptx86-sm_101a.ll
# RUN: %if ptxas-12.7 %{ \
diff --git a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
index 7bddf0b..8126e64 100644
--- a/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
+++ b/llvm/test/CodeGen/NVPTX/wmma-ptx86-sm120a.py
@@ -1,9 +1,7 @@
# Check all variants of instructions supported by PTX86 on SM120a
# RUN: %python %s --ptx=86 --gpu-arch=120 --aa > %t-ptx86-sm_120a.ll
# RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
-# RUN: FileCheck %t-ptx86-sm_120a.ll < %t-ptx86-sm_120a.ll \
-# RUN: --check-prefixes=PTX86LDMATRIX-DAG
+# RUN: --check-prefixes=PTX86LDMATRIX-DAG,PTX86STMATRIX-DAG
# RUN: llc < %t-ptx86-sm_120a.ll -mtriple=nvptx64 -mcpu=sm_120a -mattr=+ptx86 \
# RUN: | FileCheck %t-ptx86-sm_120a.ll
# RUN: %if ptxas-12.7 %{ \
diff --git a/llvm/test/CodeGen/NVPTX/wmma.py b/llvm/test/CodeGen/NVPTX/wmma.py
index 2ee4896..2eb3c3d 100644
--- a/llvm/test/CodeGen/NVPTX/wmma.py
+++ b/llvm/test/CodeGen/NVPTX/wmma.py
@@ -10,6 +10,7 @@
import argparse
from itertools import product
from string import Template
+
class MMAType:
    def __init__(self, ptx_type):
        self.ptx_type = ptx_type
@@ -176,6 +177,13 @@ class MMAFrag:
            "m8n16:x1:b8x16.b4x16_p64": 1,
            "m8n16:x2:b8x16.b4x16_p64": 2,
            "m8n16:x4:b8x16.b4x16_p64": 4,
+            # stmatrix
+            "m8n8:x1:b16": 1,
+            "m8n8:x2:b16": 2,
+            "m8n8:x4:b16": 4,
+            "m16n8:x1:b8": 1,
+            "m16n8:x2:b8": 2,
+            "m16n8:x4:b8": 4,
        }.get(
            "%s:%s:%s" % (geom, frag, ptx_elt_type),
            {
@@ -241,6 +249,13 @@ def make_ldmatrix_ops(geoms, frags, types):
    ]
+def make_stmatrix_ops(geoms, frags, types):
+    return [
+        MMAFrag(geom, frag, ptx_type)
+        for (geom, frag, ptx_type) in product(geoms, frags, types)
+    ]
+
+
def get_wmma_ops():
    return (
        make_mma_ops(["m16n16k8"], ["tf32"], [], ["f32"], [])
@@ -315,6 +330,12 @@ def get_ldmatrix_ops():
    )
+def get_stmatrix_ops():
+    return make_stmatrix_ops(["m8n8"], ["x1", "x2", "x4"], ["b16"]) + make_stmatrix_ops(
+        ["m16n8"], ["x1", "x2", "x4"], ["b8"]
+    )
+
+
def is_wmma_geom_supported(geom):
    # geometries for FP and ints.
    if geom in ["m8n32k16", "m32n8k16"]:
@@ -360,6 +381,14 @@ def is_ldmatrix_geom_supported(geom):
    assert False  # Unexpected geometry.
+def is_stmatrix_geom_supported(geom):
+    if geom in ["m8n8"]:
+        return ptx_version >= 78 and gpu_arch >= 90
+    elif geom in ["m16n8"]:
+        return ptx_version >= 86 and gpu_arch >= 100 and aa
+    assert False  # Unexpected geometry.
+
+
def is_ldmatrix_trans_supported(geom, trans):
    if geom in ["m8n8"]:
        return True
@@ -369,6 +398,15 @@
        return trans == ""
    assert False  # Unexpected geometry.
+
+def is_stmatrix_trans_supported(geom, trans):
+    if geom in ["m8n8"]:
+        return True
+    elif geom in ["m16n8"]:
+        return trans == ".trans"
+    assert False  # Unexpected geometry.
+
+
def is_type_supported(ptx_type):
    if ptx_type in ["s8", "u8", "s32"]:
        return ptx_version >= 63 and gpu_arch >= 72
@@ -463,6 +501,16 @@ def is_ldmatrix_variant_supported(frag, trans):
    return frag.frag in ["x1", "x2", "x4"]
+def is_stmatrix_variant_supported(frag, trans):
+    if not (
+        is_type_supported(frag.mma_type.ptx_type)
+        and is_stmatrix_geom_supported(frag.geom)
+        and is_stmatrix_trans_supported(frag.geom, trans)
+    ):
+        return False
+    return frag.frag in ["x1", "x2", "x4"]
+
+
def make_wmma_slice_ty(frag):
    return [frag.mma_type.llvm_type] * frag.nregs
@@ -717,6 +765,65 @@ define ${ret_ty} @test_${function}_o(i8 ${as}* %src) {
    return generated_items
+def gen_stmatrix_tests():
+    stmatrix_template = """
+declare void @${intrinsic}(i8 ${as}* %dst, ${args});
+
+; CHECK-LABEL: .func {{.*}}test_${function}(
+define void @test_${function}(i8 ${as}* %dst, ${args}) {
+; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}]
+; CHECK: {${check_args}}
+  call void @${intrinsic}(i8${as}* %dst, ${args});
+  ret void
+}
+
+; CHECK-LABEL: .func{{.*}}test_${function}_o(
+define void @test_${function}_o(i8 ${as}* %dst, ${args}) {
+; CHECK: ${instruction} {{.*}}[%rd{{[0-9+]}}+128],
+; CHECK: {${check_args}}
+  %dst1 = getelementptr i8, i8 ${as}* %dst, i32 128;
+  call void @${intrinsic}(i8 ${as}* %dst1, ${args});
+  ret void
+}
+"""
+    intrinsic_template = (
+        "llvm.nvvm.stmatrix.sync.aligned.${geom}.${frag}${trans}.${itype}.${pspace}"
+    )
+    instruction_template = (
+        "stmatrix.sync.aligned.${geom}.${frag}${trans}${space}.${itype}"
+    )
+    generated_items = []
+
+    for frag, space, trans in product(
+        get_stmatrix_ops(),
+        ["", ".shared"],
+        ["", ".trans"],
+    ):
+        if not is_stmatrix_variant_supported(frag, trans):
+            continue
+
+        params = {
+            "frag": frag.frag,
+            "space": space,
+            "trans": trans,
+            "itype": frag.mma_type.ptx_type,
+            "pspace": get_pspace(space),
+            "as": "addrspace(%d)" % get_aspace(space),
+            "geom": frag.geom,
+        }
+
+        test_params = params
+        test_params["intrinsic"] = Template(intrinsic_template).substitute(params)
+        test_params["function"] = test_params["intrinsic"].replace(".", "_")
+        test_params["instruction"] = Template(instruction_template).substitute(params)
+        test_params["args"] = make_wmma_slice_args(frag)
+        test_params["check_args"] = check_pattern(frag)
+
+        print(Template(stmatrix_template).substitute(test_params))
+        generated_items.append((test_params["intrinsic"], test_params["instruction"]))
+
+    return generated_items
+
def mma_signature(op):
    if op.a.mma_type.ptx_type == "f16":
        # FP16 ops identified by accumulator & result type.
@@ -893,6 +1000,7 @@ def gen_check_unsupported_ops(items):
; NOALTFLOAT-NOT: .{{bf16|tf32}}
; NODOUBLE-NOT: .f64
; NOLDMATRIX-NOT: ldmatrix.sync.aligned
+; NOSTMATRIX-NOT: stmatrix.sync.aligned
; M16N16-DAG: m16n16k16.load.{{[ab].*}}.f16.p
; M16N16-DAG: m16n16k16.{{load|store}}.{{[cd].*\.(f16|f32)}}.p
@@ -994,6 +1102,26 @@ def gen_check_unsupported_ops(items):
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x4.b8x16.b6x16_p32
; PTX86LDMATRIX-DAG: ldmatrix.sync.aligned.m8n16.x4.b8x16.b4x16_p64
+
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.trans.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.trans.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.trans.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x1.trans.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x2.trans.shared.b16
+; PTX78STMATRIX-DAG: stmatrix.sync.aligned.m8n8.x4.trans.shared.b16
+
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x1.trans.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x2.trans.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x4.trans.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x1.trans.shared.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x2.trans.shared.b8
+; PTX86STMATRIX-DAG: stmatrix.sync.aligned.m16n8.x4.trans.shared.b8
+
; PTX71MMA-DAG: mma.m8n8k4.row.col.f64
; PTX71MMA-DAG: mma.m16n8k4.row.col.tf32
; PTX71MMA-DAG: mma.m16n8k8.row.col.tf32
@@ -1039,6 +1167,7 @@ def gen_tests():
    items = gen_wmma_load_tests()
    items += gen_wmma_store_tests()
    items += gen_ldmatrix_tests()
+    items += gen_stmatrix_tests()
    items += gen_wmma_mma_tests()
    items += gen_mma_tests()
    gen_check_unsupported_ops(items)
std r25, 392(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r26, 400(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r27, 408(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r28, 416(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r29, 424(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r30, 432(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3 +; CHECK-PWR7-NEXT: .cfi_offset r31, -8 +; CHECK-PWR7-NEXT: .cfi_offset r2, -152 ; CHECK-PWR7-NEXT: addi r3, r1, 320 -; CHECK-PWR7-NEXT: lbz r7, 304(r1) -; CHECK-PWR7-NEXT: stxvw4x v3, 0, r3 -; CHECK-PWR7-NEXT: lbz r8, 320(r1) -; CHECK-PWR7-NEXT: lbz r9, 305(r1) -; CHECK-PWR7-NEXT: lbz r10, 321(r1) -; CHECK-PWR7-NEXT: lbz r26, 325(r1) -; CHECK-PWR7-NEXT: clrlwi r7, r7, 24 -; CHECK-PWR7-NEXT: clrlwi r8, r8, 24 -; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 -; CHECK-PWR7-NEXT: clrlwi r10, r10, 24 -; CHECK-PWR7-NEXT: lbz r11, 306(r1) -; CHECK-PWR7-NEXT: lbz r12, 322(r1) -; CHECK-PWR7-NEXT: lbz r23, 314(r1) -; CHECK-PWR7-NEXT: clrlwi r22, r26, 24 -; CHECK-PWR7-NEXT: lbz r26, 330(r1) -; CHECK-PWR7-NEXT: sub r8, r7, r8 -; CHECK-PWR7-NEXT: lbz r7, 315(r1) -; CHECK-PWR7-NEXT: sub r20, r9, r10 -; CHECK-PWR7-NEXT: lbz r9, 331(r1) -; CHECK-PWR7-NEXT: lbz r0, 307(r1) -; CHECK-PWR7-NEXT: lbz r30, 323(r1) -; CHECK-PWR7-NEXT: clrlwi r11, r11, 24 -; CHECK-PWR7-NEXT: clrlwi r12, r12, 24 -; CHECK-PWR7-NEXT: clrlwi r23, r23, 24 -; CHECK-PWR7-NEXT: clrlwi r21, r26, 24 -; CHECK-PWR7-NEXT: clrlwi r7, r7, 24 -; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 -; CHECK-PWR7-NEXT: clrlwi r0, r0, 24 -; CHECK-PWR7-NEXT: clrlwi r30, r30, 24 -; CHECK-PWR7-NEXT: lbz r29, 308(r1) -; CHECK-PWR7-NEXT: lbz r28, 324(r1) -; CHECK-PWR7-NEXT: lbz r27, 309(r1) -; CHECK-PWR7-NEXT: lbz r25, 310(r1) -; CHECK-PWR7-NEXT: lbz r24, 326(r1) -; CHECK-PWR7-NEXT: sub r19, r11, r12 -; CHECK-PWR7-NEXT: sub r11, r23, r21 -; CHECK-PWR7-NEXT: sub r9, r7, r9 -; CHECK-PWR7-NEXT: sub r26, r0, r30 -; CHECK-PWR7-NEXT: srawi r12, r11, 31 -; CHECK-PWR7-NEXT: srawi r0, r9, 31 -; CHECK-PWR7-NEXT: lbz r3, 312(r1) -; CHECK-PWR7-NEXT: clrlwi r29, r29, 24 -; CHECK-PWR7-NEXT: clrlwi r28, r28, 24 -; CHECK-PWR7-NEXT: clrlwi r27, r27, 24 -; CHECK-PWR7-NEXT: clrlwi r25, r25, 24 -; CHECK-PWR7-NEXT: clrlwi r24, r24, 24 -; CHECK-PWR7-NEXT: xor r11, r11, r12 -; CHECK-PWR7-NEXT: xor r9, r9, r0 -; CHECK-PWR7-NEXT: sub r28, r29, r28 -; CHECK-PWR7-NEXT: sub r30, r27, r22 -; CHECK-PWR7-NEXT: sub r29, r25, r24 -; CHECK-PWR7-NEXT: sub r27, r11, r12 -; CHECK-PWR7-NEXT: sub r24, r9, r0 -; CHECK-PWR7-NEXT: lbz r9, 316(r1) -; CHECK-PWR7-NEXT: lbz r11, 332(r1) -; CHECK-PWR7-NEXT: lbz r4, 328(r1) -; CHECK-PWR7-NEXT: lbz r5, 311(r1) -; CHECK-PWR7-NEXT: lbz r6, 327(r1) -; CHECK-PWR7-NEXT: clrlwi r11, r11, 24 -; CHECK-PWR7-NEXT: clrlwi r3, r3, 24 -; CHECK-PWR7-NEXT: clrlwi r4, r4, 24 -; CHECK-PWR7-NEXT: clrlwi r5, r5, 24 -; CHECK-PWR7-NEXT: clrlwi r6, r6, 24 -; CHECK-PWR7-NEXT: sub r3, r3, r4 +; CHECK-PWR7-NEXT: std r14, 368(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r15, 376(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r16, 384(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r17, 392(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r18, 400(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r19, 408(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r20, 416(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r21, 424(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r22, 432(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r23, 440(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r24, 448(r1) # 8-byte Folded Spill +; 
CHECK-PWR7-NEXT: std r25, 456(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r26, 464(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r27, 472(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r28, 480(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r29, 488(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r30, 496(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r31, 504(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r2, 360(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3 +; CHECK-PWR7-NEXT: lbz r3, 320(r1) +; CHECK-PWR7-NEXT: addi r4, r1, 336 +; CHECK-PWR7-NEXT: stw r3, 60(r1) # 4-byte Folded Spill +; CHECK-PWR7-NEXT: stxvw4x v3, 0, r4 +; CHECK-PWR7-NEXT: lbz r15, 334(r1) +; CHECK-PWR7-NEXT: lbz r14, 350(r1) +; CHECK-PWR7-NEXT: lbz r31, 335(r1) +; CHECK-PWR7-NEXT: lbz r2, 351(r1) +; CHECK-PWR7-NEXT: sub r15, r15, r14 +; CHECK-PWR7-NEXT: sub r14, r31, r2 +; CHECK-PWR7-NEXT: srawi r2, r14, 31 +; CHECK-PWR7-NEXT: xor r14, r14, r2 +; CHECK-PWR7-NEXT: lbz r3, 333(r1) +; CHECK-PWR7-NEXT: lbz r19, 331(r1) +; CHECK-PWR7-NEXT: lbz r18, 347(r1) +; CHECK-PWR7-NEXT: sub r19, r19, r18 +; CHECK-PWR7-NEXT: lbz r17, 332(r1) +; CHECK-PWR7-NEXT: lbz r16, 348(r1) +; CHECK-PWR7-NEXT: sub r17, r17, r16 +; CHECK-PWR7-NEXT: lbz r23, 329(r1) +; CHECK-PWR7-NEXT: sub r14, r14, r2 +; CHECK-PWR7-NEXT: lbz r2, 349(r1) +; CHECK-PWR7-NEXT: lbz r22, 345(r1) +; CHECK-PWR7-NEXT: lbz r4, 336(r1) +; CHECK-PWR7-NEXT: lbz r5, 321(r1) +; CHECK-PWR7-NEXT: lbz r6, 337(r1) +; CHECK-PWR7-NEXT: lbz r7, 322(r1) +; CHECK-PWR7-NEXT: lbz r8, 338(r1) +; CHECK-PWR7-NEXT: lbz r9, 323(r1) +; CHECK-PWR7-NEXT: lbz r10, 339(r1) +; CHECK-PWR7-NEXT: lbz r11, 324(r1) +; CHECK-PWR7-NEXT: lbz r12, 340(r1) +; CHECK-PWR7-NEXT: lbz r0, 325(r1) +; CHECK-PWR7-NEXT: lbz r30, 341(r1) +; CHECK-PWR7-NEXT: lbz r29, 326(r1) +; CHECK-PWR7-NEXT: lbz r28, 342(r1) +; CHECK-PWR7-NEXT: lbz r27, 327(r1) +; CHECK-PWR7-NEXT: lbz r26, 343(r1) +; CHECK-PWR7-NEXT: sub r3, r3, r2 +; CHECK-PWR7-NEXT: lbz r25, 328(r1) +; CHECK-PWR7-NEXT: lbz r24, 344(r1) +; CHECK-PWR7-NEXT: lbz r21, 330(r1) +; CHECK-PWR7-NEXT: lbz r20, 346(r1) ; CHECK-PWR7-NEXT: sub r5, r5, r6 -; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 -; CHECK-PWR7-NEXT: srawi r4, r3, 31 +; CHECK-PWR7-NEXT: srawi r18, r3, 31 +; CHECK-PWR7-NEXT: sub r7, r7, r8 +; CHECK-PWR7-NEXT: sub r9, r9, r10 +; CHECK-PWR7-NEXT: sub r11, r11, r12 +; CHECK-PWR7-NEXT: sub r0, r0, r30 +; CHECK-PWR7-NEXT: sub r29, r29, r28 +; CHECK-PWR7-NEXT: sub r27, r27, r26 +; CHECK-PWR7-NEXT: sub r25, r25, r24 +; CHECK-PWR7-NEXT: srawi r31, r15, 31 +; CHECK-PWR7-NEXT: ld r2, 360(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: xor r3, r3, r18 ; CHECK-PWR7-NEXT: srawi r6, r5, 31 -; CHECK-PWR7-NEXT: xor r3, r3, r4 -; CHECK-PWR7-NEXT: sldi r27, r27, 56 -; CHECK-PWR7-NEXT: xor r5, r5, r6 -; CHECK-PWR7-NEXT: sub r9, r9, r11 -; CHECK-PWR7-NEXT: sub r3, r3, r4 -; CHECK-PWR7-NEXT: sldi r24, r24, 56 +; CHECK-PWR7-NEXT: srawi r8, r7, 31 +; CHECK-PWR7-NEXT: srawi r10, r9, 31 +; CHECK-PWR7-NEXT: srawi r12, r11, 31 +; CHECK-PWR7-NEXT: srawi r30, r0, 31 +; CHECK-PWR7-NEXT: sub r3, r3, r18 +; CHECK-PWR7-NEXT: srawi r18, r19, 31 +; CHECK-PWR7-NEXT: srawi r28, r29, 31 +; CHECK-PWR7-NEXT: ld r16, 384(r1) # 8-byte Folded Reload ; CHECK-PWR7-NEXT: sldi r3, r3, 56 -; CHECK-PWR7-NEXT: srawi r11, r9, 31 -; CHECK-PWR7-NEXT: std r27, 208(r1) -; CHECK-PWR7-NEXT: sub r4, r5, r6 -; CHECK-PWR7-NEXT: std r27, 216(r1) -; CHECK-PWR7-NEXT: srawi r27, r29, 31 -; CHECK-PWR7-NEXT: lbz r10, 313(r1) -; CHECK-PWR7-NEXT: xor r9, r9, r11 -; CHECK-PWR7-NEXT: std r24, 
224(r1) -; CHECK-PWR7-NEXT: lbz r22, 329(r1) -; CHECK-PWR7-NEXT: std r24, 232(r1) -; CHECK-PWR7-NEXT: srawi r24, r30, 31 -; CHECK-PWR7-NEXT: ld r21, 360(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: sub r23, r9, r11 -; CHECK-PWR7-NEXT: lbz r9, 317(r1) -; CHECK-PWR7-NEXT: lbz r11, 333(r1) -; CHECK-PWR7-NEXT: xor r29, r29, r27 -; CHECK-PWR7-NEXT: std r3, 176(r1) -; CHECK-PWR7-NEXT: std r3, 184(r1) -; CHECK-PWR7-NEXT: sldi r3, r4, 56 -; CHECK-PWR7-NEXT: sldi r23, r23, 56 -; CHECK-PWR7-NEXT: xor r30, r30, r24 -; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 -; CHECK-PWR7-NEXT: clrlwi r11, r11, 24 -; CHECK-PWR7-NEXT: sub r4, r30, r24 -; CHECK-PWR7-NEXT: ld r30, 432(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: std r3, 160(r1) -; CHECK-PWR7-NEXT: std r3, 168(r1) -; CHECK-PWR7-NEXT: sub r9, r9, r11 -; CHECK-PWR7-NEXT: sub r3, r29, r27 -; CHECK-PWR7-NEXT: std r23, 240(r1) -; CHECK-PWR7-NEXT: ld r29, 424(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: srawi r11, r9, 31 -; CHECK-PWR7-NEXT: std r23, 248(r1) -; CHECK-PWR7-NEXT: ld r27, 408(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: srawi r23, r28, 31 +; CHECK-PWR7-NEXT: srawi r26, r27, 31 +; CHECK-PWR7-NEXT: srawi r24, r25, 31 +; CHECK-PWR7-NEXT: xor r19, r19, r18 +; CHECK-PWR7-NEXT: xor r15, r15, r31 +; CHECK-PWR7-NEXT: xor r5, r5, r6 +; CHECK-PWR7-NEXT: std r3, 272(r1) +; CHECK-PWR7-NEXT: std r3, 280(r1) +; CHECK-PWR7-NEXT: srawi r3, r17, 31 +; CHECK-PWR7-NEXT: sub r19, r19, r18 +; CHECK-PWR7-NEXT: xor r7, r7, r8 +; CHECK-PWR7-NEXT: sub r15, r15, r31 +; CHECK-PWR7-NEXT: xor r17, r17, r3 +; CHECK-PWR7-NEXT: xor r9, r9, r10 +; CHECK-PWR7-NEXT: xor r11, r11, r12 +; CHECK-PWR7-NEXT: xor r0, r0, r30 +; CHECK-PWR7-NEXT: xor r29, r29, r28 +; CHECK-PWR7-NEXT: xor r27, r27, r26 +; CHECK-PWR7-NEXT: sub r3, r17, r3 +; CHECK-PWR7-NEXT: xor r25, r25, r24 +; CHECK-PWR7-NEXT: sub r25, r25, r24 +; CHECK-PWR7-NEXT: sub r27, r27, r26 +; CHECK-PWR7-NEXT: sub r29, r29, r28 ; CHECK-PWR7-NEXT: sldi r3, r3, 56 -; CHECK-PWR7-NEXT: xor r28, r28, r23 -; CHECK-PWR7-NEXT: xor r9, r9, r11 -; CHECK-PWR7-NEXT: std r3, 144(r1) -; CHECK-PWR7-NEXT: ld r24, 384(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: std r3, 152(r1) -; CHECK-PWR7-NEXT: sldi r3, r4, 56 -; CHECK-PWR7-NEXT: sub r25, r9, r11 -; CHECK-PWR7-NEXT: lbz r9, 318(r1) -; CHECK-PWR7-NEXT: lbz r11, 334(r1) -; CHECK-PWR7-NEXT: std r3, 128(r1) +; CHECK-PWR7-NEXT: sub r0, r0, r30 +; CHECK-PWR7-NEXT: sub r11, r11, r12 +; CHECK-PWR7-NEXT: sub r9, r9, r10 +; CHECK-PWR7-NEXT: sub r7, r7, r8 +; CHECK-PWR7-NEXT: sub r5, r5, r6 +; CHECK-PWR7-NEXT: sldi r14, r14, 56 +; CHECK-PWR7-NEXT: sldi r15, r15, 56 +; CHECK-PWR7-NEXT: ld r31, 504(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r3, 256(r1) +; CHECK-PWR7-NEXT: std r3, 264(r1) +; CHECK-PWR7-NEXT: sldi r3, r19, 56 ; CHECK-PWR7-NEXT: sldi r25, r25, 56 -; CHECK-PWR7-NEXT: std r3, 136(r1) -; CHECK-PWR7-NEXT: sub r3, r28, r23 +; CHECK-PWR7-NEXT: sldi r27, r27, 56 +; CHECK-PWR7-NEXT: std r3, 240(r1) +; CHECK-PWR7-NEXT: std r3, 248(r1) +; CHECK-PWR7-NEXT: sub r3, r23, r22 +; CHECK-PWR7-NEXT: srawi r23, r3, 31 +; CHECK-PWR7-NEXT: sub r22, r21, r20 +; CHECK-PWR7-NEXT: srawi r21, r22, 31 +; CHECK-PWR7-NEXT: sldi r29, r29, 56 +; CHECK-PWR7-NEXT: sldi r0, r0, 56 +; CHECK-PWR7-NEXT: sldi r11, r11, 56 +; CHECK-PWR7-NEXT: xor r3, r3, r23 +; CHECK-PWR7-NEXT: xor r22, r22, r21 +; CHECK-PWR7-NEXT: sldi r9, r9, 56 +; CHECK-PWR7-NEXT: sldi r7, r7, 56 +; CHECK-PWR7-NEXT: sldi r5, r5, 56 +; CHECK-PWR7-NEXT: ld r30, 496(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r28, 480(r1) # 8-byte Folded Reload +; 
CHECK-PWR7-NEXT: sub r3, r3, r23 +; CHECK-PWR7-NEXT: sub r22, r22, r21 +; CHECK-PWR7-NEXT: std r14, 304(r1) +; CHECK-PWR7-NEXT: ld r26, 464(r1) # 8-byte Folded Reload ; CHECK-PWR7-NEXT: sldi r3, r3, 56 -; CHECK-PWR7-NEXT: std r3, 112(r1) -; CHECK-PWR7-NEXT: ld r28, 416(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 -; CHECK-PWR7-NEXT: clrlwi r11, r11, 24 -; CHECK-PWR7-NEXT: clrlwi r10, r10, 24 -; CHECK-PWR7-NEXT: std r25, 256(r1) -; CHECK-PWR7-NEXT: std r25, 264(r1) -; CHECK-PWR7-NEXT: sub r9, r9, r11 -; CHECK-PWR7-NEXT: srawi r25, r26, 31 -; CHECK-PWR7-NEXT: xor r26, r26, r25 -; CHECK-PWR7-NEXT: ld r23, 376(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: srawi r11, r9, 31 -; CHECK-PWR7-NEXT: std r3, 120(r1) -; CHECK-PWR7-NEXT: sub r4, r26, r25 -; CHECK-PWR7-NEXT: clrlwi r22, r22, 24 -; CHECK-PWR7-NEXT: srawi r7, r8, 31 -; CHECK-PWR7-NEXT: sub r10, r10, r22 -; CHECK-PWR7-NEXT: ld r26, 400(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: xor r9, r9, r11 -; CHECK-PWR7-NEXT: sldi r3, r4, 56 -; CHECK-PWR7-NEXT: srawi r22, r10, 31 -; CHECK-PWR7-NEXT: xor r8, r8, r7 -; CHECK-PWR7-NEXT: xor r10, r10, r22 -; CHECK-PWR7-NEXT: sub r10, r10, r22 -; CHECK-PWR7-NEXT: ld r25, 392(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: sub r12, r9, r11 -; CHECK-PWR7-NEXT: lbz r9, 319(r1) -; CHECK-PWR7-NEXT: lbz r11, 335(r1) -; CHECK-PWR7-NEXT: std r3, 96(r1) -; CHECK-PWR7-NEXT: sldi r12, r12, 56 -; CHECK-PWR7-NEXT: std r3, 104(r1) -; CHECK-PWR7-NEXT: ld r22, 368(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: sldi r10, r10, 56 -; CHECK-PWR7-NEXT: std r10, 192(r1) -; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 -; CHECK-PWR7-NEXT: clrlwi r11, r11, 24 -; CHECK-PWR7-NEXT: sub r9, r9, r11 -; CHECK-PWR7-NEXT: std r12, 272(r1) -; CHECK-PWR7-NEXT: std r12, 280(r1) -; CHECK-PWR7-NEXT: srawi r12, r19, 31 -; CHECK-PWR7-NEXT: xor r0, r19, r12 -; CHECK-PWR7-NEXT: ld r19, 344(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: sub r3, r0, r12 -; CHECK-PWR7-NEXT: srawi r11, r9, 31 -; CHECK-PWR7-NEXT: std r10, 200(r1) -; CHECK-PWR7-NEXT: xor r9, r9, r11 +; CHECK-PWR7-NEXT: sldi r22, r22, 56 +; CHECK-PWR7-NEXT: ld r24, 448(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r23, 440(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r14, 312(r1) +; CHECK-PWR7-NEXT: std r15, 288(r1) +; CHECK-PWR7-NEXT: std r3, 208(r1) +; CHECK-PWR7-NEXT: std r3, 216(r1) +; CHECK-PWR7-NEXT: lwz r3, 60(r1) # 4-byte Folded Reload +; CHECK-PWR7-NEXT: std r15, 296(r1) +; CHECK-PWR7-NEXT: ld r21, 424(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r20, 416(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r22, 224(r1) +; CHECK-PWR7-NEXT: std r22, 232(r1) +; CHECK-PWR7-NEXT: sub r4, r3, r4 +; CHECK-PWR7-NEXT: std r25, 192(r1) +; CHECK-PWR7-NEXT: ld r22, 432(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r19, 408(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: srawi r3, r4, 31 +; CHECK-PWR7-NEXT: std r25, 200(r1) +; CHECK-PWR7-NEXT: ld r25, 456(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r27, 176(r1) +; CHECK-PWR7-NEXT: std r27, 184(r1) +; CHECK-PWR7-NEXT: xor r4, r4, r3 +; CHECK-PWR7-NEXT: std r29, 160(r1) +; CHECK-PWR7-NEXT: ld r27, 472(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r29, 168(r1) +; CHECK-PWR7-NEXT: std r0, 144(r1) +; CHECK-PWR7-NEXT: sub r3, r4, r3 +; CHECK-PWR7-NEXT: std r0, 152(r1) +; CHECK-PWR7-NEXT: ld r29, 488(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: ld r18, 400(r1) # 8-byte Folded Reload ; CHECK-PWR7-NEXT: sldi r3, r3, 56 -; CHECK-PWR7-NEXT: sub r9, r9, r11 -; CHECK-PWR7-NEXT: std r3, 80(r1) -; CHECK-PWR7-NEXT: 
std r3, 88(r1) -; CHECK-PWR7-NEXT: sldi r9, r9, 56 -; CHECK-PWR7-NEXT: std r9, 288(r1) -; CHECK-PWR7-NEXT: std r9, 296(r1) -; CHECK-PWR7-NEXT: srawi r9, r20, 31 -; CHECK-PWR7-NEXT: xor r11, r20, r9 -; CHECK-PWR7-NEXT: ld r20, 352(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: sub r4, r11, r9 -; CHECK-PWR7-NEXT: sldi r3, r4, 56 +; CHECK-PWR7-NEXT: std r11, 128(r1) +; CHECK-PWR7-NEXT: ld r17, 392(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r11, 136(r1) +; CHECK-PWR7-NEXT: std r9, 112(r1) ; CHECK-PWR7-NEXT: std r3, 64(r1) ; CHECK-PWR7-NEXT: std r3, 72(r1) -; CHECK-PWR7-NEXT: sub r3, r8, r7 -; CHECK-PWR7-NEXT: sldi r3, r3, 56 -; CHECK-PWR7-NEXT: std r3, 48(r1) -; CHECK-PWR7-NEXT: std r3, 56(r1) -; CHECK-PWR7-NEXT: addi r3, r1, 288 +; CHECK-PWR7-NEXT: addi r3, r1, 304 +; CHECK-PWR7-NEXT: std r9, 120(r1) +; CHECK-PWR7-NEXT: ld r15, 376(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r7, 96(r1) +; CHECK-PWR7-NEXT: std r7, 104(r1) +; CHECK-PWR7-NEXT: std r5, 80(r1) +; CHECK-PWR7-NEXT: std r5, 88(r1) ; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 272 +; CHECK-PWR7-NEXT: addi r3, r1, 288 ; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 256 +; CHECK-PWR7-NEXT: addi r3, r1, 272 +; CHECK-PWR7-NEXT: ld r14, 368(r1) # 8-byte Folded Reload ; CHECK-PWR7-NEXT: vmrghb v2, v3, v2 ; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 240 +; CHECK-PWR7-NEXT: addi r3, r1, 256 ; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 224 +; CHECK-PWR7-NEXT: addi r3, r1, 240 ; CHECK-PWR7-NEXT: vmrghb v3, v4, v3 ; CHECK-PWR7-NEXT: vmrghh v2, v3, v2 ; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 208 +; CHECK-PWR7-NEXT: addi r3, r1, 224 ; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 192 +; CHECK-PWR7-NEXT: addi r3, r1, 208 ; CHECK-PWR7-NEXT: vmrghb v3, v4, v3 ; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 176 +; CHECK-PWR7-NEXT: addi r3, r1, 192 ; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 160 +; CHECK-PWR7-NEXT: addi r3, r1, 176 ; CHECK-PWR7-NEXT: vmrghb v4, v5, v4 ; CHECK-PWR7-NEXT: vmrghh v3, v4, v3 ; CHECK-PWR7-NEXT: xxmrghw vs0, v3, v2 ; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 144 +; CHECK-PWR7-NEXT: addi r3, r1, 160 ; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 128 +; CHECK-PWR7-NEXT: addi r3, r1, 144 ; CHECK-PWR7-NEXT: vmrghb v2, v3, v2 ; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 112 +; CHECK-PWR7-NEXT: addi r3, r1, 128 ; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 96 ; CHECK-PWR7-NEXT: vmrghb v3, v4, v3 +; CHECK-PWR7-NEXT: addi r3, r1, 112 ; CHECK-PWR7-NEXT: vmrghh v2, v3, v2 ; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 80 +; CHECK-PWR7-NEXT: addi r3, r1, 96 ; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 64 +; CHECK-PWR7-NEXT: addi r3, r1, 80 ; CHECK-PWR7-NEXT: vmrghb v3, v4, v3 ; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 48 +; CHECK-PWR7-NEXT: addi r3, r1, 64 ; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3 ; CHECK-PWR7-NEXT: vmrghb v4, v5, v4 ; CHECK-PWR7-NEXT: vmrghh v3, v4, v3 ; CHECK-PWR7-NEXT: xxmrghw vs1, v3, v2 ; CHECK-PWR7-NEXT: xxmrghd v2, vs1, vs0 -; CHECK-PWR7-NEXT: addi r1, r1, 448 +; CHECK-PWR7-NEXT: addi r1, r1, 512 ; CHECK-PWR7-NEXT: blr entry: %vecext = extractelement <16 x i8> %a, i32 0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll 
b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll index 4b999b8..6864afe 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll @@ -66,7 +66,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind { ; RV64IM-NEXT: srli a2, a2, 32 ; RV64IM-NEXT: mul a1, a2, a1 ; RV64IM-NEXT: srli a1, a1, 32 -; RV64IM-NEXT: subw a0, a0, a1 +; RV64IM-NEXT: sub a0, a0, a1 ; RV64IM-NEXT: srliw a0, a0, 1 ; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: srliw a0, a0, 2 @@ -79,7 +79,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind { ; RV64IMZB-NEXT: zext.w a2, a0 ; RV64IMZB-NEXT: mul a1, a2, a1 ; RV64IMZB-NEXT: srli a1, a1, 32 -; RV64IMZB-NEXT: subw a0, a0, a1 +; RV64IMZB-NEXT: sub a0, a0, a1 ; RV64IMZB-NEXT: srliw a0, a0, 1 ; RV64IMZB-NEXT: add a0, a0, a1 ; RV64IMZB-NEXT: srliw a0, a0, 2 @@ -250,7 +250,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind { ; RV64-NEXT: zext.b a2, a0 ; RV64-NEXT: mul a1, a2, a1 ; RV64-NEXT: srli a1, a1, 8 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: zext.b a0, a0 ; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: add a0, a0, a1 @@ -414,8 +414,7 @@ define i32 @sdiv_constant_srai(i32 %a) nounwind { ; RV64-NEXT: addi a1, a1, 1639 ; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: mul a0, a0, a1 -; RV64-NEXT: srai a0, a0, 32 -; RV64-NEXT: sraiw a0, a0, 1 +; RV64-NEXT: srai a0, a0, 33 ; RV64-NEXT: srliw a1, a0, 31 ; RV64-NEXT: addw a0, a0, a1 ; RV64-NEXT: ret @@ -656,8 +655,6 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV32IM-NEXT: srai a0, a0, 24 ; RV32IM-NEXT: mul a0, a0, a1 ; RV32IM-NEXT: slli a0, a0, 16 -; RV32IM-NEXT: srai a0, a0, 24 -; RV32IM-NEXT: slli a0, a0, 24 ; RV32IM-NEXT: srai a0, a0, 25 ; RV32IM-NEXT: zext.b a1, a0 ; RV32IM-NEXT: srli a1, a1, 7 @@ -670,9 +667,7 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV32IMZB-NEXT: sext.b a0, a0 ; RV32IMZB-NEXT: mul a0, a0, a1 ; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: srai a0, a0, 8 -; RV32IMZB-NEXT: sext.b a0, a0 -; RV32IMZB-NEXT: srai a0, a0, 1 +; RV32IMZB-NEXT: srai a0, a0, 9 ; RV32IMZB-NEXT: zext.b a1, a0 ; RV32IMZB-NEXT: srli a1, a1, 7 ; RV32IMZB-NEXT: add a0, a0, a1 @@ -685,8 +680,6 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV64IM-NEXT: srai a0, a0, 56 ; RV64IM-NEXT: mul a0, a0, a1 ; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 56 -; RV64IM-NEXT: slli a0, a0, 56 ; RV64IM-NEXT: srai a0, a0, 57 ; RV64IM-NEXT: zext.b a1, a0 ; RV64IM-NEXT: srli a1, a1, 7 @@ -699,9 +692,7 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind { ; RV64IMZB-NEXT: sext.b a0, a0 ; RV64IMZB-NEXT: mul a0, a0, a1 ; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: srai a0, a0, 8 -; RV64IMZB-NEXT: sext.b a0, a0 -; RV64IMZB-NEXT: srai a0, a0, 1 +; RV64IMZB-NEXT: srai a0, a0, 9 ; RV64IMZB-NEXT: zext.b a1, a0 ; RV64IMZB-NEXT: srli a1, a1, 7 ; RV64IMZB-NEXT: add a0, a0, a1 @@ -816,7 +807,7 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind { ; RV64IM-NEXT: mul a1, a2, a1 ; RV64IM-NEXT: slli a1, a1, 48 ; RV64IM-NEXT: srai a1, a1, 56 -; RV64IM-NEXT: subw a1, a1, a0 +; RV64IM-NEXT: sub a1, a1, a0 ; RV64IM-NEXT: slli a1, a1, 56 ; RV64IM-NEXT: srai a0, a1, 58 ; RV64IM-NEXT: zext.b a1, a0 @@ -906,8 +897,6 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV32IM-NEXT: addi a1, a1, 1639 ; RV32IM-NEXT: srai a0, a0, 16 ; RV32IM-NEXT: mul a0, a0, a1 -; RV32IM-NEXT: srai a0, a0, 16 -; RV32IM-NEXT: slli a0, a0, 16 ; RV32IM-NEXT: srai a0, a0, 17 ; RV32IM-NEXT: slli a1, a0, 16 ; RV32IM-NEXT: srli a1, a1, 16 @@ -921,9 +910,7 @@ define i16 
@sdiv16_constant_srai(i16 %a) nounwind { ; RV32IMZB-NEXT: addi a1, a1, 1639 ; RV32IMZB-NEXT: sext.h a0, a0 ; RV32IMZB-NEXT: mul a0, a0, a1 -; RV32IMZB-NEXT: srai a0, a0, 16 -; RV32IMZB-NEXT: sext.h a0, a0 -; RV32IMZB-NEXT: srai a0, a0, 1 +; RV32IMZB-NEXT: srai a0, a0, 17 ; RV32IMZB-NEXT: zext.h a1, a0 ; RV32IMZB-NEXT: srli a1, a1, 15 ; RV32IMZB-NEXT: add a0, a0, a1 @@ -936,9 +923,7 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV64IM-NEXT: addi a1, a1, 1639 ; RV64IM-NEXT: srai a0, a0, 48 ; RV64IM-NEXT: mul a0, a0, a1 -; RV64IM-NEXT: sraiw a0, a0, 16 -; RV64IM-NEXT: slli a0, a0, 48 -; RV64IM-NEXT: srai a0, a0, 49 +; RV64IM-NEXT: sraiw a0, a0, 17 ; RV64IM-NEXT: slli a1, a0, 48 ; RV64IM-NEXT: srli a1, a1, 48 ; RV64IM-NEXT: srli a1, a1, 15 @@ -951,9 +936,7 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind { ; RV64IMZB-NEXT: addi a1, a1, 1639 ; RV64IMZB-NEXT: sext.h a0, a0 ; RV64IMZB-NEXT: mul a0, a0, a1 -; RV64IMZB-NEXT: sraiw a0, a0, 16 -; RV64IMZB-NEXT: sext.h a0, a0 -; RV64IMZB-NEXT: srai a0, a0, 1 +; RV64IMZB-NEXT: sraiw a0, a0, 17 ; RV64IMZB-NEXT: zext.h a1, a0 ; RV64IMZB-NEXT: srli a1, a1, 15 ; RV64IMZB-NEXT: add a0, a0, a1 @@ -1071,7 +1054,7 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind { ; RV64IM-NEXT: srai a2, a2, 48 ; RV64IM-NEXT: mul a1, a2, a1 ; RV64IM-NEXT: sraiw a1, a1, 16 -; RV64IM-NEXT: subw a1, a1, a0 +; RV64IM-NEXT: sub a1, a1, a0 ; RV64IM-NEXT: slli a1, a1, 48 ; RV64IM-NEXT: srai a0, a1, 51 ; RV64IM-NEXT: slli a1, a0, 48 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll b/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll index a49e94f..620c5ec 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/double-convert.ll @@ -246,17 +246,11 @@ define double @fcvt_d_wu(i32 %a) nounwind { } define double @fcvt_d_wu_load(ptr %p) nounwind { -; RV32IFD-LABEL: fcvt_d_wu_load: -; RV32IFD: # %bb.0: -; RV32IFD-NEXT: lw a0, 0(a0) -; RV32IFD-NEXT: fcvt.d.wu fa0, a0 -; RV32IFD-NEXT: ret -; -; RV64IFD-LABEL: fcvt_d_wu_load: -; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lwu a0, 0(a0) -; RV64IFD-NEXT: fcvt.d.wu fa0, a0 -; RV64IFD-NEXT: ret +; CHECKIFD-LABEL: fcvt_d_wu_load: +; CHECKIFD: # %bb.0: +; CHECKIFD-NEXT: lw a0, 0(a0) +; CHECKIFD-NEXT: fcvt.d.wu fa0, a0 +; CHECKIFD-NEXT: ret ; ; RV32I-LABEL: fcvt_d_wu_load: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll index fa09362..bbea792 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-convert.ll @@ -232,17 +232,11 @@ define float @fcvt_s_wu(i32 %a) nounwind { } define float @fcvt_s_wu_load(ptr %p) nounwind { -; RV32IF-LABEL: fcvt_s_wu_load: -; RV32IF: # %bb.0: -; RV32IF-NEXT: lw a0, 0(a0) -; RV32IF-NEXT: fcvt.s.wu fa0, a0 -; RV32IF-NEXT: ret -; -; RV64IF-LABEL: fcvt_s_wu_load: -; RV64IF: # %bb.0: -; RV64IF-NEXT: lwu a0, 0(a0) -; RV64IF-NEXT: fcvt.s.wu fa0, a0 -; RV64IF-NEXT: ret +; CHECKIF-LABEL: fcvt_s_wu_load: +; CHECKIF: # %bb.0: +; CHECKIF-NEXT: lw a0, 0(a0) +; CHECKIF-NEXT: fcvt.s.wu fa0, a0 +; CHECKIF-NEXT: ret ; ; RV32I-LABEL: fcvt_s_wu_load: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir index 78a2227b..a7c1c63 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-abs-rv64.mir @@ -88,8 +88,7 @@ body: | 
; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASSERT_SEXT]], [[ASHR]] ; RV64I-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32 ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[SEXT_INREG]], [[ASHR]] - ; RV64I-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[XOR]], 32 - ; RV64I-NEXT: $x10 = COPY [[SEXT_INREG1]](s64) + ; RV64I-NEXT: $x10 = COPY [[XOR]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 ; ; RV64ZBB-LABEL: name: abs_i32 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll index 8a786fc..46d1661 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rotl-rotr.ll @@ -29,7 +29,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotl_32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -55,7 +55,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: sllw a1, a0, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -78,7 +78,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotr_32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -104,7 +104,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: srlw a1, a0, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -167,7 +167,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotl_64: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -276,7 +276,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: sll a1, a0, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -340,7 +340,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotr_64: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -451,7 +451,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: srl a1, a0, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -474,7 +474,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotl_32_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -490,7 +490,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64ZBB-LABEL: rotl_32_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: neg a2, a1 ; RV64ZBB-NEXT: sllw a1, a0, a1 ; RV64ZBB-NEXT: srlw a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -506,7 +506,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_32_mask: ; 
RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: sllw a1, a0, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -531,7 +531,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64I-LABEL: rotl_32_mask_and_63_and_31: ; RV64I: # %bb.0: ; RV64I-NEXT: sllw a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -547,7 +547,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotl_32_mask_and_63_and_31: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sllw a2, a0, a1 -; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: neg a1, a1 ; RV64ZBB-NEXT: srlw a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -563,7 +563,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sllw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -632,7 +632,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotr_32_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -648,7 +648,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64ZBB-LABEL: rotr_32_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: neg a2, a1 ; RV64ZBB-NEXT: srlw a1, a0, a1 ; RV64ZBB-NEXT: sllw a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -664,7 +664,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_32_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: srlw a1, a0, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -689,7 +689,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64I-LABEL: rotr_32_mask_and_63_and_31: ; RV64I: # %bb.0: ; RV64I-NEXT: srlw a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sllw a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -705,7 +705,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotr_32_mask_and_63_and_31: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: srlw a2, a0, a1 -; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: neg a1, a1 ; RV64ZBB-NEXT: sllw a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -721,7 +721,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srlw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -829,7 +829,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotl_64_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -884,7 +884,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64ZBB-LABEL: rotl_64_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: neg a2, a1 ; RV64ZBB-NEXT: sll a1, a0, a1 ; RV64ZBB-NEXT: srl a0, a0, a2 ; RV64ZBB-NEXT: or a0, 
a1, a0 @@ -939,7 +939,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotl_64_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: sll a1, a0, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -1005,7 +1005,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64I-LABEL: rotl_64_mask_and_127_and_63: ; RV64I: # %bb.0: ; RV64I-NEXT: sll a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -1062,7 +1062,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64ZBB-LABEL: rotl_64_mask_and_127_and_63: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: sll a2, a0, a1 -; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: neg a1, a1 ; RV64ZBB-NEXT: srl a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -1119,7 +1119,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sll a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1277,7 +1277,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotr_64_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -1331,7 +1331,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64ZBB-LABEL: rotr_64_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: neg a2, a1 ; RV64ZBB-NEXT: srl a1, a0, a1 ; RV64ZBB-NEXT: sll a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -1385,7 +1385,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64XTHEADBB-LABEL: rotr_64_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: neg a2, a1 ; RV64XTHEADBB-NEXT: srl a1, a0, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -1451,7 +1451,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64I-LABEL: rotr_64_mask_and_127_and_63: ; RV64I: # %bb.0: ; RV64I-NEXT: srl a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -1508,7 +1508,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64ZBB-LABEL: rotr_64_mask_and_127_and_63: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: srl a2, a0, a1 -; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: neg a1, a1 ; RV64ZBB-NEXT: sll a0, a0, a1 ; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -1565,7 +1565,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1701,7 +1701,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: sllw a4, a0, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: srlw a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sllw a1, a1, a2 @@ -1737,7 +1737,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext 
%a, i32 signext %b, i32 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: sllw a4, a0, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: srlw a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sllw a1, a1, a2 @@ -1822,7 +1822,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: sll a4, a0, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: srl a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sll a1, a1, a2 @@ -1972,7 +1972,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: sll a4, a0, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: srl a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sll a1, a1, a2 @@ -2002,7 +2002,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: srlw a4, a0, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: sllw a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sllw a1, a1, a2 @@ -2038,7 +2038,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: srlw a4, a0, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: sllw a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sllw a1, a1, a2 @@ -2125,7 +2125,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64I: # %bb.0: ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: srl a4, a0, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: sll a0, a0, a3 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: sll a1, a1, a2 @@ -2279,7 +2279,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: srl a4, a0, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: sll a0, a0, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 ; RV64XTHEADBB-NEXT: sll a1, a1, a2 @@ -2312,8 +2312,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: sllw a4, a0, a2 ; RV64I-NEXT: sllw a2, a1, a2 -; RV64I-NEXT: negw a5, a3 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a5, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: srlw a0, a0, a5 ; RV64I-NEXT: srlw a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -2353,8 +2353,8 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: sllw a4, a0, a2 ; RV64XTHEADBB-NEXT: sllw a2, a1, a2 -; RV64XTHEADBB-NEXT: negw a5, a3 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a5, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: srlw a0, a0, a5 ; RV64XTHEADBB-NEXT: srlw a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -2464,7 +2464,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: sll a4, a0, a2 ; RV64I-NEXT: sll a2, a1, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: srl a0, a0, a3 ; RV64I-NEXT: srl a1, a1, a3 ; RV64I-NEXT: or a0, a4, 
a0 @@ -2664,7 +2664,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: sll a4, a0, a2 ; RV64XTHEADBB-NEXT: sll a2, a1, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: srl a0, a0, a3 ; RV64XTHEADBB-NEXT: srl a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -2697,8 +2697,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64I-NEXT: andi a3, a2, 31 ; RV64I-NEXT: srlw a4, a0, a2 ; RV64I-NEXT: srlw a2, a1, a2 -; RV64I-NEXT: negw a5, a3 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a5, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: sllw a0, a0, a5 ; RV64I-NEXT: sllw a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -2738,8 +2738,8 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64XTHEADBB-NEXT: andi a3, a2, 31 ; RV64XTHEADBB-NEXT: srlw a4, a0, a2 ; RV64XTHEADBB-NEXT: srlw a2, a1, a2 -; RV64XTHEADBB-NEXT: negw a5, a3 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a5, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: sllw a0, a0, a5 ; RV64XTHEADBB-NEXT: sllw a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -2850,7 +2850,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64I-NEXT: andi a3, a2, 63 ; RV64I-NEXT: srl a4, a0, a2 ; RV64I-NEXT: srl a2, a1, a2 -; RV64I-NEXT: negw a3, a3 +; RV64I-NEXT: neg a3, a3 ; RV64I-NEXT: sll a0, a0, a3 ; RV64I-NEXT: sll a1, a1, a3 ; RV64I-NEXT: or a0, a4, a0 @@ -3052,7 +3052,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64XTHEADBB-NEXT: andi a3, a2, 63 ; RV64XTHEADBB-NEXT: srl a4, a0, a2 ; RV64XTHEADBB-NEXT: srl a2, a1, a2 -; RV64XTHEADBB-NEXT: negw a3, a3 +; RV64XTHEADBB-NEXT: neg a3, a3 ; RV64XTHEADBB-NEXT: sll a0, a0, a3 ; RV64XTHEADBB-NEXT: sll a1, a1, a3 ; RV64XTHEADBB-NEXT: or a0, a4, a0 @@ -3116,7 +3116,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV64I-LABEL: rotl_64_zext: ; RV64I: # %bb.0: ; RV64I-NEXT: li a2, 64 -; RV64I-NEXT: subw a2, a2, a1 +; RV64I-NEXT: sub a2, a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -3171,7 +3171,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotl_64_zext: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a2, 64 -; RV64ZBB-NEXT: subw a2, a2, a1 +; RV64ZBB-NEXT: sub a2, a2, a1 ; RV64ZBB-NEXT: sll a1, a0, a1 ; RV64ZBB-NEXT: srl a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -3226,7 +3226,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_zext: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: li a2, 64 -; RV64XTHEADBB-NEXT: subw a2, a2, a1 +; RV64XTHEADBB-NEXT: sub a2, a2, a1 ; RV64XTHEADBB-NEXT: sll a1, a0, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 @@ -3289,7 +3289,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV64I-LABEL: rotr_64_zext: ; RV64I: # %bb.0: ; RV64I-NEXT: li a2, 64 -; RV64I-NEXT: subw a2, a2, a1 +; RV64I-NEXT: sub a2, a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -3343,7 +3343,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV64ZBB-LABEL: rotr_64_zext: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a2, 64 -; RV64ZBB-NEXT: subw a2, a2, a1 +; RV64ZBB-NEXT: sub a2, a2, a1 ; RV64ZBB-NEXT: srl a1, a0, a1 ; RV64ZBB-NEXT: sll a0, a0, a2 ; RV64ZBB-NEXT: or a0, a1, a0 @@ -3397,7 +3397,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; 
RV64XTHEADBB-LABEL: rotr_64_zext: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: li a2, 64 -; RV64XTHEADBB-NEXT: subw a2, a2, a1 +; RV64XTHEADBB-NEXT: sub a2, a2, a1 ; RV64XTHEADBB-NEXT: srl a1, a0, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a2 ; RV64XTHEADBB-NEXT: or a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll index 1eddb8f..b7f84ba 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll @@ -107,7 +107,7 @@ declare i32 @llvm.fshl.i32(i32, i32, i32) define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: rol_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -125,7 +125,7 @@ define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind { define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind { ; RV64I-LABEL: rol_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a3, a1 +; RV64I-NEXT: neg a3, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a3 ; RV64I-NEXT: or a0, a1, a0 @@ -146,7 +146,7 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind { ; RV64I-LABEL: rol_i32_neg_constant_rhs: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, -2 -; RV64I-NEXT: negw a2, a0 +; RV64I-NEXT: neg a2, a0 ; RV64I-NEXT: sllw a0, a1, a0 ; RV64I-NEXT: srlw a1, a1, a2 ; RV64I-NEXT: or a0, a0, a1 @@ -166,7 +166,7 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: rol_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -185,7 +185,7 @@ declare i32 @llvm.fshr.i32(i32, i32, i32) define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: ror_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -203,7 +203,7 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind { define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind { ; RV64I-LABEL: ror_i32_nosext: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a3, a1 +; RV64I-NEXT: neg a3, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a3 ; RV64I-NEXT: or a0, a1, a0 @@ -224,7 +224,7 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind { ; RV64I-LABEL: ror_i32_neg_constant_rhs: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, -2 -; RV64I-NEXT: negw a2, a0 +; RV64I-NEXT: neg a2, a0 ; RV64I-NEXT: srlw a0, a1, a0 ; RV64I-NEXT: sllw a1, a1, a2 ; RV64I-NEXT: or a0, a0, a1 @@ -244,7 +244,7 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: ror_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll index 9690302..2dd3bb3 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll @@ -31,7 +31,7 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, 
a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -88,7 +88,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -103,7 +103,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 ; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: j .LBB1_3 ; RV64I-NEXT: .LBB1_2: ; RV64I-NEXT: li a0, 32 @@ -153,7 +153,7 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -168,7 +168,7 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-NEXT: call __muldi3 ; RV64I-NEXT: srliw a0, a0, 24 ; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: subw a1, a1, a0 +; RV64I-NEXT: sub a1, a1, a0 ; RV64I-NEXT: .LBB2_2: # %cond.end ; RV64I-NEXT: subw a0, s0, a1 ; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -212,7 +212,7 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -283,7 +283,7 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -412,7 +412,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -455,7 +455,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -497,7 +497,7 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -553,7 +553,7 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a2, a1 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: and a1, a1, a2 @@ -672,7 +672,7 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind { ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 209715 ; RV64I-NEXT: addi a2, a2, 819 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: 
and a1, a1, a2
@@ -728,7 +728,7 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
; RV64I-NEXT: and a1, a2, a1
; RV64I-NEXT: lui a2, 209715
; RV64I-NEXT: addi a2, a2, 819
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: srliw a1, a0, 2
; RV64I-NEXT: and a0, a0, a2
; RV64I-NEXT: and a1, a1, a2
@@ -748,7 +748,7 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
;
; RV64ZBB-LABEL: ctpop_i32_load:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: lwu a0, 0(a0)
+; RV64ZBB-NEXT: lw a0, 0(a0)
; RV64ZBB-NEXT: cpopw a0, a0
; RV64ZBB-NEXT: ret
 %a = load i32, ptr %p
@@ -1053,9 +1053,8 @@ define signext i32 @abs_i32_sext(i32 signext %x) {
; RV64I-LABEL: abs_i32_sext:
; RV64I: # %bb.0:
; RV64I-NEXT: srai a1, a0, 31
-; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: addw a0, a0, a1
; RV64I-NEXT: xor a0, a0, a1
-; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: abs_i32_sext:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
index cd59c9e..ba058ca 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
@@ -114,7 +114,7 @@ define i64 @pack_i64_2(i32 signext %a, i32 signext %b) nounwind {
 define i64 @pack_i64_3(ptr %0, ptr %1) {
; RV64I-LABEL: pack_i64_3:
; RV64I: # %bb.0:
-; RV64I-NEXT: lwu a0, 0(a0)
+; RV64I-NEXT: lw a0, 0(a0)
; RV64I-NEXT: lwu a1, 0(a1)
; RV64I-NEXT: slli a0, a0, 32
; RV64I-NEXT: or a0, a0, a1
@@ -122,8 +122,8 @@ define i64 @pack_i64_3(ptr %0, ptr %1) {
;
; RV64ZBKB-LABEL: pack_i64_3:
; RV64ZBKB: # %bb.0:
-; RV64ZBKB-NEXT: lwu a0, 0(a0)
-; RV64ZBKB-NEXT: lwu a1, 0(a1)
+; RV64ZBKB-NEXT: lw a0, 0(a0)
+; RV64ZBKB-NEXT: lw a1, 0(a1)
; RV64ZBKB-NEXT: pack a0, a1, a0
; RV64ZBKB-NEXT: ret
 %3 = load i32, ptr %0, align 4
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
index 8b262db..d634cc9 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/shifts.ll
@@ -330,13 +330,13 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: li a3, 64
; RV64I-NEXT: bltu a2, a3, .LBB6_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a4, a2, a3
+; RV64I-NEXT: sub a4, a2, a3
; RV64I-NEXT: srl a4, a1, a4
; RV64I-NEXT: bnez a2, .LBB6_3
; RV64I-NEXT: j .LBB6_4
; RV64I-NEXT: .LBB6_2:
; RV64I-NEXT: srl a4, a0, a2
-; RV64I-NEXT: negw a5, a2
+; RV64I-NEXT: neg a5, a2
; RV64I-NEXT: sll a5, a1, a5
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: beqz a2, .LBB6_4
@@ -476,13 +476,13 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: li a3, 64
; RV64I-NEXT: bltu a2, a3, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a4, a2, a3
+; RV64I-NEXT: sub a4, a2, a3
; RV64I-NEXT: sra a4, a1, a4
; RV64I-NEXT: bnez a2, .LBB7_3
; RV64I-NEXT: j .LBB7_4
; RV64I-NEXT: .LBB7_2:
; RV64I-NEXT: srl a4, a0, a2
-; RV64I-NEXT: negw a5, a2
+; RV64I-NEXT: neg a5, a2
; RV64I-NEXT: sll a5, a1, a5
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: beqz a2, .LBB7_4
@@ -615,13 +615,13 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: bltu a2, a4, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a0, 0
-; RV64I-NEXT: subw a4, a2, a4
+; RV64I-NEXT: sub a4, a2, a4
; RV64I-NEXT: sll a3, a3, a4
; RV64I-NEXT: bnez a2, .LBB8_3
; RV64I-NEXT: j .LBB8_4
; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: sll a0, a3, a2
-; RV64I-NEXT: negw a4, a2
+; RV64I-NEXT: neg a4, a2
; RV64I-NEXT: srl a3, a3, a4
; RV64I-NEXT: sll a4, a1, a2
; RV64I-NEXT: or a3, a3, a4
@@ -685,7 +685,7 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
;
; RV64I-LABEL: fshr64_minsize:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a2, a1
+; RV64I-NEXT: neg a2, a1
; RV64I-NEXT: srl a1, a0, a1
; RV64I-NEXT: sll a0, a0, a2
; RV64I-NEXT: or a0, a1, a0
@@ -914,12 +914,12 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: li a4, 64
; RV64I-NEXT: bltu a5, a4, .LBB10_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a3, a5, a4
+; RV64I-NEXT: sub a3, a5, a4
; RV64I-NEXT: srl a6, a1, a3
; RV64I-NEXT: j .LBB10_3
; RV64I-NEXT: .LBB10_2:
; RV64I-NEXT: srl a3, a0, a2
-; RV64I-NEXT: negw a6, a5
+; RV64I-NEXT: neg a6, a5
; RV64I-NEXT: sll a6, a1, a6
; RV64I-NEXT: or a6, a3, a6
; RV64I-NEXT: .LBB10_3:
@@ -928,7 +928,7 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: # %bb.4:
; RV64I-NEXT: mv a3, a6
; RV64I-NEXT: .LBB10_5:
-; RV64I-NEXT: negw a7, a2
+; RV64I-NEXT: neg a7, a2
; RV64I-NEXT: bltu a5, a4, .LBB10_7
; RV64I-NEXT: # %bb.6:
; RV64I-NEXT: li a2, 0
@@ -940,13 +940,13 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
; RV64I-NEXT: bltu a6, a4, .LBB10_10
; RV64I-NEXT: # %bb.9:
; RV64I-NEXT: li a5, 0
-; RV64I-NEXT: subw a4, a6, a4
+; RV64I-NEXT: sub a4, a6, a4
; RV64I-NEXT: sll a0, a0, a4
; RV64I-NEXT: bnez a6, .LBB10_11
; RV64I-NEXT: j .LBB10_12
; RV64I-NEXT: .LBB10_10:
; RV64I-NEXT: sll a5, a0, a7
-; RV64I-NEXT: negw a4, a6
+; RV64I-NEXT: neg a4, a6
; RV64I-NEXT: srl a0, a0, a4
; RV64I-NEXT: sll a4, a1, a7
; RV64I-NEXT: or a0, a0, a4
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
index 69519c0..014b1c1 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -758,13 +758,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB6_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: srl a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB6_3
; RV64I-NEXT: j .LBB6_4
; RV64I-NEXT: .LBB6_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB6_4
@@ -1091,13 +1091,13 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: srl a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB7_3
; RV64I-NEXT: j .LBB7_4
; RV64I-NEXT: .LBB7_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB7_4
@@ -1425,13 +1425,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu a3, a5, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 0
-; RV64I-NEXT: subw a5, a3, a5
+; RV64I-NEXT: sub a5, a3, a5
; RV64I-NEXT: sll a4, a4, a5
; RV64I-NEXT: bnez a3, .LBB8_3
; RV64I-NEXT: j .LBB8_4
; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: sll a1, a4, a3
-; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: neg a5, a3
; RV64I-NEXT: srl a4, a4, a5
; RV64I-NEXT: sll a5, a0, a3
; RV64I-NEXT: or a4, a4, a5
@@ -1754,13 +1754,13 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: bltu a3, a5, .LBB9_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 0
-; RV64I-NEXT: subw a5, a3, a5
+; RV64I-NEXT: sub a5, a3, a5
; RV64I-NEXT: sll a4, a4, a5
; RV64I-NEXT: bnez a3, .LBB9_3
; RV64I-NEXT: j .LBB9_4
; RV64I-NEXT: .LBB9_2:
; RV64I-NEXT: sll a1, a4, a3
-; RV64I-NEXT: negw a5, a3
+; RV64I-NEXT: neg a5, a3
; RV64I-NEXT: srl a4, a4, a5
; RV64I-NEXT: sll a5, a0, a3
; RV64I-NEXT: or a4, a4, a5
@@ -2083,13 +2083,13 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB10_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: sra a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB10_3
; RV64I-NEXT: j .LBB10_4
; RV64I-NEXT: .LBB10_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB10_4
@@ -2416,13 +2416,13 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a3, a6, a7
; RV64I-NEXT: bltu a1, a4, .LBB11_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: subw a5, a1, a4
+; RV64I-NEXT: sub a5, a1, a4
; RV64I-NEXT: sra a5, a3, a5
; RV64I-NEXT: bnez a1, .LBB11_3
; RV64I-NEXT: j .LBB11_4
; RV64I-NEXT: .LBB11_2:
; RV64I-NEXT: srl a5, a0, a1
-; RV64I-NEXT: negw a6, a1
+; RV64I-NEXT: neg a6, a1
; RV64I-NEXT: sll a6, a3, a6
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: beqz a1, .LBB11_4
@@ -2796,8 +2796,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 3
-; RV64I-NEXT: subw t1, a5, a7
-; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: neg t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB12_2
; RV64I-NEXT: # %bb.1:
@@ -2842,7 +2842,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bgeu t6, a7, .LBB12_14
; RV64I-NEXT: .LBB12_12:
; RV64I-NEXT: sll t5, a6, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a6, s0
; RV64I-NEXT: or s1, s0, t3
; RV64I-NEXT: j .LBB12_15
@@ -2851,7 +2851,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu t6, a7, .LBB12_12
; RV64I-NEXT: .LBB12_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t3, t6, a7
+; RV64I-NEXT: sub t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB12_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -2862,13 +2862,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB12_17:
; RV64I-NEXT: bltu s0, a7, .LBB12_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, a7
+; RV64I-NEXT: sub t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB12_20
; RV64I-NEXT: j .LBB12_21
; RV64I-NEXT: .LBB12_19:
; RV64I-NEXT: srl t6, a6, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, t0, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB12_21
@@ -3720,8 +3720,8 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 5
-; RV64I-NEXT: subw t1, a5, a7
-; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: neg t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB13_2
; RV64I-NEXT: # %bb.1:
@@ -3766,7 +3766,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bgeu t6, a7, .LBB13_14
; RV64I-NEXT: .LBB13_12:
; RV64I-NEXT: sll t5, a6, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a6, s0
; RV64I-NEXT: or s1, s0, t3
; RV64I-NEXT: j .LBB13_15
@@ -3775,7 +3775,7 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bltu t6, a7, .LBB13_12
; RV64I-NEXT: .LBB13_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t3, t6, a7
+; RV64I-NEXT: sub t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB13_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -3786,13 +3786,13 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: .LBB13_17:
; RV64I-NEXT: bltu s0, a7, .LBB13_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, a7
+; RV64I-NEXT: sub t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB13_20
; RV64I-NEXT: j .LBB13_21
; RV64I-NEXT: .LBB13_19:
; RV64I-NEXT: srl t6, a6, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, t0, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB13_21
@@ -4644,8 +4644,8 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: or t0, t5, t3
; RV64I-NEXT: or a5, s0, t6
; RV64I-NEXT: slli a5, a5, 6
-; RV64I-NEXT: subw t1, a5, a7
-; RV64I-NEXT: negw t5, a5
+; RV64I-NEXT: sub t1, a5, a7
+; RV64I-NEXT: neg t5, a5
; RV64I-NEXT: sll t3, t0, t5
; RV64I-NEXT: bltu a5, a7, .LBB14_2
; RV64I-NEXT: # %bb.1:
@@ -4690,7 +4690,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bgeu t6, a7, .LBB14_14
; RV64I-NEXT: .LBB14_12:
; RV64I-NEXT: sll t5, a6, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a6, s0
; RV64I-NEXT: or s1, s0, t3
; RV64I-NEXT: j .LBB14_15
@@ -4699,7 +4699,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bltu t6, a7, .LBB14_12
; RV64I-NEXT: .LBB14_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t3, t6, a7
+; RV64I-NEXT: sub t3, t6, a7
; RV64I-NEXT: sll s1, a6, t3
; RV64I-NEXT: .LBB14_15:
; RV64I-NEXT: sub s0, a5, t1
@@ -4710,13 +4710,13 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: .LBB14_17:
; RV64I-NEXT: bltu s0, a7, .LBB14_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, a7
+; RV64I-NEXT: sub t6, s0, a7
; RV64I-NEXT: srl t6, t0, t6
; RV64I-NEXT: bnez s0, .LBB14_20
; RV64I-NEXT: j .LBB14_21
; RV64I-NEXT: .LBB14_19:
; RV64I-NEXT: srl t6, a6, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, t0, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB14_21
@@ -5542,8 +5542,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 3
-; RV64I-NEXT: subw t2, a6, t0
-; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: neg t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB15_2
; RV64I-NEXT: # %bb.1:
@@ -5585,11 +5585,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB15_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: subw s0, a4, t0
+; RV64I-NEXT: sub s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB15_8
; RV64I-NEXT: .LBB15_7:
-; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: neg s6, a4
; RV64I-NEXT: sll s6, a5, s6
; RV64I-NEXT: or s0, s0, s6
; RV64I-NEXT: .LBB15_8:
@@ -5637,13 +5637,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu s0, t0, .LBB15_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: subw t0, s0, t0
+; RV64I-NEXT: sub t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB15_21
; RV64I-NEXT: j .LBB15_22
; RV64I-NEXT: .LBB15_20:
; RV64I-NEXT: sll t2, t1, s0
-; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: neg t0, s0
; RV64I-NEXT: srl t0, t1, t0
; RV64I-NEXT: sll t1, a5, s0
; RV64I-NEXT: or t0, t0, t1
@@ -6456,8 +6456,8 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 5
-; RV64I-NEXT: subw t2, a6, t0
-; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: neg t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB16_2
; RV64I-NEXT: # %bb.1:
@@ -6499,11 +6499,11 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB16_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: subw s0, a4, t0
+; RV64I-NEXT: sub s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB16_8
; RV64I-NEXT: .LBB16_7:
-; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: neg s6, a4
; RV64I-NEXT: sll s6, a5, s6
; RV64I-NEXT: or s0, s0, s6
; RV64I-NEXT: .LBB16_8:
@@ -6551,13 +6551,13 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
; RV64I-NEXT: bltu s0, t0, .LBB16_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: subw t0, s0, t0
+; RV64I-NEXT: sub t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB16_21
; RV64I-NEXT: j .LBB16_22
; RV64I-NEXT: .LBB16_20:
; RV64I-NEXT: sll t2, t1, s0
-; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: neg t0, s0
; RV64I-NEXT: srl t0, t1, t0
; RV64I-NEXT: sll t1, a5, s0
; RV64I-NEXT: or t0, t0, t1
@@ -7370,8 +7370,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: or a5, s0, a6
; RV64I-NEXT: or a6, a1, s5
; RV64I-NEXT: slli a6, a6, 6
-; RV64I-NEXT: subw t2, a6, t0
-; RV64I-NEXT: negw t3, a6
+; RV64I-NEXT: sub t2, a6, t0
+; RV64I-NEXT: neg t3, a6
; RV64I-NEXT: srl s0, t1, t3
; RV64I-NEXT: bltu a6, t0, .LBB17_2
; RV64I-NEXT: # %bb.1:
@@ -7413,11 +7413,11 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: slli s4, s9, 16
; RV64I-NEXT: bltu a4, t0, .LBB17_7
; RV64I-NEXT: # %bb.6:
-; RV64I-NEXT: subw s0, a4, t0
+; RV64I-NEXT: sub s0, a4, t0
; RV64I-NEXT: srl s0, a5, s0
; RV64I-NEXT: j .LBB17_8
; RV64I-NEXT: .LBB17_7:
-; RV64I-NEXT: negw s6, a4
+; RV64I-NEXT: neg s6, a4
; RV64I-NEXT: sll s6, a5, s6
; RV64I-NEXT: or s0, s0, s6
; RV64I-NEXT: .LBB17_8:
@@ -7465,13 +7465,13 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; RV64I-NEXT: bltu s0, t0, .LBB17_20
; RV64I-NEXT: # %bb.19:
; RV64I-NEXT: li t2, 0
-; RV64I-NEXT: subw t0, s0, t0
+; RV64I-NEXT: sub t0, s0, t0
; RV64I-NEXT: sll t0, t1, t0
; RV64I-NEXT: bnez s0, .LBB17_21
; RV64I-NEXT: j .LBB17_22
; RV64I-NEXT: .LBB17_20:
; RV64I-NEXT: sll t2, t1, s0
-; RV64I-NEXT: negw t0, s0
+; RV64I-NEXT: neg t0, s0
; RV64I-NEXT: srl t0, t1, t0
; RV64I-NEXT: sll t1, a5, s0
; RV64I-NEXT: or t0, t0, t1
@@ -8310,8 +8310,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 3
-; RV64I-NEXT: subw t1, a6, t0
-; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: neg t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB18_2
; RV64I-NEXT: # %bb.1:
@@ -8356,7 +8356,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bgeu t6, t0, .LBB18_14
; RV64I-NEXT: .LBB18_12:
; RV64I-NEXT: sll t5, a7, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a7, s0
; RV64I-NEXT: or s1, s0, t4
; RV64I-NEXT: j .LBB18_15
@@ -8365,7 +8365,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: bltu t6, t0, .LBB18_12
; RV64I-NEXT: .LBB18_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t4, t6, t0
+; RV64I-NEXT: sub t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB18_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -8376,13 +8376,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: .LBB18_17:
; RV64I-NEXT: bltu s0, t0, .LBB18_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, t0
+; RV64I-NEXT: sub t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB18_20
; RV64I-NEXT: j .LBB18_21
; RV64I-NEXT: .LBB18_19:
; RV64I-NEXT: srl t6, a7, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, a5, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB18_21
@@ -9241,8 +9241,8 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 5
-; RV64I-NEXT: subw t1, a6, t0
-; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: neg t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB19_2
; RV64I-NEXT: # %bb.1:
@@ -9287,7 +9287,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bgeu t6, t0, .LBB19_14
; RV64I-NEXT: .LBB19_12:
; RV64I-NEXT: sll t5, a7, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a7, s0
; RV64I-NEXT: or s1, s0, t4
; RV64I-NEXT: j .LBB19_15
@@ -9296,7 +9296,7 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: bltu t6, t0, .LBB19_12
; RV64I-NEXT: .LBB19_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t4, t6, t0
+; RV64I-NEXT: sub t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB19_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -9307,13 +9307,13 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
; RV64I-NEXT: .LBB19_17:
; RV64I-NEXT: bltu s0, t0, .LBB19_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, t0
+; RV64I-NEXT: sub t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB19_20
; RV64I-NEXT: j .LBB19_21
; RV64I-NEXT: .LBB19_19:
; RV64I-NEXT: srl t6, a7, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, a5, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB19_21
@@ -10172,8 +10172,8 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: or a5, t5, t4
; RV64I-NEXT: or a6, s0, t6
; RV64I-NEXT: slli a6, a6, 6
-; RV64I-NEXT: subw t1, a6, t0
-; RV64I-NEXT: negw t5, a6
+; RV64I-NEXT: sub t1, a6, t0
+; RV64I-NEXT: neg t5, a6
; RV64I-NEXT: sll t4, a5, t5
; RV64I-NEXT: bltu a6, t0, .LBB20_2
; RV64I-NEXT: # %bb.1:
@@ -10218,7 +10218,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bgeu t6, t0, .LBB20_14
; RV64I-NEXT: .LBB20_12:
; RV64I-NEXT: sll t5, a7, t5
-; RV64I-NEXT: negw s0, t6
+; RV64I-NEXT: neg s0, t6
; RV64I-NEXT: srl s0, a7, s0
; RV64I-NEXT: or s1, s0, t4
; RV64I-NEXT: j .LBB20_15
@@ -10227,7 +10227,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: bltu t6, t0, .LBB20_12
; RV64I-NEXT: .LBB20_14:
; RV64I-NEXT: li t5, 0
-; RV64I-NEXT: subw t4, t6, t0
+; RV64I-NEXT: sub t4, t6, t0
; RV64I-NEXT: sll s1, a7, t4
; RV64I-NEXT: .LBB20_15:
; RV64I-NEXT: sub s0, a6, t1
@@ -10238,13 +10238,13 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; RV64I-NEXT: .LBB20_17:
; RV64I-NEXT: bltu s0, t0, .LBB20_19
; RV64I-NEXT: # %bb.18:
-; RV64I-NEXT: subw t6, s0, t0
+; RV64I-NEXT: sub t6, s0, t0
; RV64I-NEXT: sra t6, a5, t6
; RV64I-NEXT: bnez s0, .LBB20_20
; RV64I-NEXT: j .LBB20_21
; RV64I-NEXT: .LBB20_19:
; RV64I-NEXT: srl t6, a7, s0
-; RV64I-NEXT: negw s1, s0
+; RV64I-NEXT: neg s1, s0
; RV64I-NEXT: sll s1, a5, s1
; RV64I-NEXT: or t6, t6, s1
; RV64I-NEXT: beqz s0, .LBB20_21
diff --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll
index 3fb0f2c..41f73f5 100644
--- a/llvm/test/CodeGen/RISCV/abds-neg.ll
+++ b/llvm/test/CodeGen/RISCV/abds-neg.ll
@@ -2221,7 +2221,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a1, a0
@@ -2236,7 +2236,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind {
;
; RV64ZBB-LABEL: abd_subnsw_i32:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
+; RV64ZBB-NEXT: sub a0, a0, a1
; RV64ZBB-NEXT: sraiw a1, a0, 31
; RV64ZBB-NEXT: xor a0, a0, a1
; RV64ZBB-NEXT: subw a0, a1, a0
@@ -2258,7 +2258,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32_undef:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a1, a0
@@ -2273,7 +2273,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind {
;
; RV64ZBB-LABEL: abd_subnsw_i32_undef:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
+; RV64ZBB-NEXT: sub a0, a0, a1
; RV64ZBB-NEXT: sraiw a1, a0, 31
; RV64ZBB-NEXT: xor a0, a0, a1
; RV64ZBB-NEXT: subw a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll
index efb4e1a..28a95ef 100644
--- a/llvm/test/CodeGen/RISCV/abds.ll
+++ b/llvm/test/CodeGen/RISCV/abds.ll
@@ -1733,21 +1733,13 @@ define i8 @abd_subnsw_i8(i8 %a, i8 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i8:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.b a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i8:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.b a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i8:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.b a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
 %sub = sub nsw i8 %a, %b
 %abs = call i8 @llvm.abs.i8(i8 %sub, i1 false)
 ret i8 %abs
@@ -1772,21 +1764,13 @@ define i8 @abd_subnsw_i8_undef(i8 %a, i8 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i8_undef:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.b a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i8_undef:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.b a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i8_undef:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.b a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
 %sub = sub nsw i8 %a, %b
 %abs = call i8 @llvm.abs.i8(i8 %sub, i1 true)
 ret i8 %abs
@@ -1811,21 +1795,13 @@ define i16 @abd_subnsw_i16(i16 %a, i16 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i16:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.h a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i16:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.h a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i16:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.h a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
 %sub = sub nsw i16 %a, %b
 %abs = call i16 @llvm.abs.i16(i16 %sub, i1 false)
 ret i16 %abs
@@ -1850,21 +1826,13 @@ define i16 @abd_subnsw_i16_undef(i16 %a, i16 %b) nounwind {
; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: ret
;
-; RV32ZBB-LABEL: abd_subnsw_i16_undef:
-; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: sub a0, a0, a1
-; RV32ZBB-NEXT: sext.h a0, a0
-; RV32ZBB-NEXT: neg a1, a0
-; RV32ZBB-NEXT: max a0, a0, a1
-; RV32ZBB-NEXT: ret
-;
-; RV64ZBB-LABEL: abd_subnsw_i16_undef:
-; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: subw a0, a0, a1
-; RV64ZBB-NEXT: sext.h a0, a0
-; RV64ZBB-NEXT: neg a1, a0
-; RV64ZBB-NEXT: max a0, a0, a1
-; RV64ZBB-NEXT: ret
+; ZBB-LABEL: abd_subnsw_i16_undef:
+; ZBB: # %bb.0:
+; ZBB-NEXT: sub a0, a0, a1
+; ZBB-NEXT: sext.h a0, a0
+; ZBB-NEXT: neg a1, a0
+; ZBB-NEXT: max a0, a0, a1
+; ZBB-NEXT: ret
 %sub = sub nsw i16 %a, %b
 %abs = call i16 @llvm.abs.i16(i16 %sub, i1 true)
 ret i16 %abs
@@ -1881,7 +1849,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a0, a1
@@ -1916,7 +1884,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_subnsw_i32_undef:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a0, a1
@@ -2317,7 +2285,7 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind {
;
; RV64I-LABEL: abd_sub_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: subw a0, a0, a1
+; RV64I-NEXT: sub a0, a0, a1
; RV64I-NEXT: sraiw a1, a0, 31
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: subw a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
index aac355e..3b2cab2 100644
--- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -20,7 +20,7 @@ define i32 @add_mul_combine_accept_a1(i32 %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 1073
; RV64IMB-NEXT: ret
 %tmp0 = add i32 %x, 37
@@ -41,7 +41,7 @@ define signext i32 @add_mul_combine_accept_a2(i32 signext %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 1073
; RV64IMB-NEXT: ret
 %tmp0 = add i32 %x, 37
@@ -93,7 +93,7 @@ define i32 @add_mul_combine_accept_b1(i32 %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh3add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: lui a1, 50
; RV64IMB-NEXT: addi a1, a1, 1119
; RV64IMB-NEXT: addw a0, a0, a1
@@ -118,7 +118,7 @@ define signext i32 @add_mul_combine_accept_b2(i32 signext %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: sh3add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: lui a1, 50
; RV64IMB-NEXT: addi a1, a1, 1119
; RV64IMB-NEXT: addw a0, a0, a1
@@ -456,7 +456,7 @@ define i32 @add_mul_combine_reject_f1(i32 %x) {
; RV64IMB-NEXT: addi a0, a0, 1972
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 11
; RV64IMB-NEXT: ret
 %tmp0 = mul i32 %x, 29
@@ -479,7 +479,7 @@ define signext i32 @add_mul_combine_reject_f2(i32 signext %x) {
; RV64IMB-NEXT: addi a0, a0, 1972
; RV64IMB-NEXT: sh1add a1, a0, a0
; RV64IMB-NEXT: slli a0, a0, 5
-; RV64IMB-NEXT: subw a0, a0, a1
+; RV64IMB-NEXT: sub a0, a0, a1
; RV64IMB-NEXT: addiw a0, a0, 11
; RV64IMB-NEXT: ret
 %tmp0 = mul i32 %x, 29
diff --git a/llvm/test/CodeGen/RISCV/aext-to-sext.ll b/llvm/test/CodeGen/RISCV/aext-to-sext.ll
index f3f71a9..34549a0 100644
--- a/llvm/test/CodeGen/RISCV/aext-to-sext.ll
+++ b/llvm/test/CodeGen/RISCV/aext-to-sext.ll
@@ -16,7 +16,7 @@ define void @quux(i32 signext %arg, i32 signext %arg1) nounwind {
; RV64I-NEXT: addi sp, sp, -16
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT: subw s0, a1, a0
+; RV64I-NEXT: sub s0, a1, a0
; RV64I-NEXT: .LBB0_2: # %bb2
; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
; RV64I-NEXT: call hoge
diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index bebc097..7d29ac9 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -4582,7 +4582,7 @@ define signext i32 @atomicrmw_and_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB56_2: # %else
-; RV64I-NEXT: lwu a1, 0(a0)
+; RV64I-NEXT: lw a1, 0(a0)
; RV64I-NEXT: andi a2, a1, 1
; RV64I-NEXT: sw a2, 0(a0)
; RV64I-NEXT: sext.w a0, a1
@@ -4700,7 +4700,7 @@ define signext i32 @atomicrmw_nand_i32_monotonic_crossbb(ptr %a, i1 %c) nounwind
; RV64I-NEXT: sext.w a0, a0
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB57_2: # %else
-; RV64I-NEXT: lwu a1, 0(a0)
+; RV64I-NEXT: lw a1, 0(a0)
; RV64I-NEXT: andi a2, a1, 1
; RV64I-NEXT: sw a2, 0(a0)
; RV64I-NEXT: sext.w a0, a1
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
index 27704d1..ea9786d 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
@@ -161,7 +161,7 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
; RV64IA-NEXT: sltu t0, t0, a5
; RV64IA-NEXT: addi t0, t0, -1
; RV64IA-NEXT: and t0, t0, a1
-; RV64IA-NEXT: subw a6, a6, t0
+; RV64IA-NEXT: sub a6, a6, t0
; RV64IA-NEXT: zext.b a6, a6
; RV64IA-NEXT: sllw a6, a6, a0
; RV64IA-NEXT: and a3, a3, a4
@@ -345,7 +345,7 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
; RV64IA-NEXT: sltu t1, t1, a6
; RV64IA-NEXT: addi t1, t1, -1
; RV64IA-NEXT: and t1, t1, a1
-; RV64IA-NEXT: subw a7, a7, t1
+; RV64IA-NEXT: sub a7, a7, t1
; RV64IA-NEXT: and a7, a7, a3
; RV64IA-NEXT: sllw a7, a7, a0
; RV64IA-NEXT: and a4, a4, a5
diff --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index ada1933..4e04f38 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -150,7 +150,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; RV64IA-NEXT: zext.b a7, a5
; RV64IA-NEXT: addi a5, a5, 1
; RV64IA-NEXT: sltu a7, a7, a1
-; RV64IA-NEXT: negw a7, a7
+; RV64IA-NEXT: neg a7, a7
; RV64IA-NEXT: and a5, a7, a5
; RV64IA-NEXT: zext.b a5, a5
; RV64IA-NEXT: sllw a5, a5, a0
@@ -325,7 +325,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; RV64IA-NEXT: addi a6, a6, 1
; RV64IA-NEXT: sltu t0, t0, a1
; RV64IA-NEXT: and a6, a6, a3
-; RV64IA-NEXT: negw t0, t0
+; RV64IA-NEXT: neg t0, t0
; RV64IA-NEXT: and a6, t0, a6
; RV64IA-NEXT: sllw a6, a6, a0
; RV64IA-NEXT: and a4, a4, a5
diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index 3422ea6..6207a17 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -1074,7 +1074,7 @@ define bfloat @fcvt_bf16_wu_load(ptr %p) nounwind {
;
; CHECK64ZFBFMIN-LABEL: fcvt_bf16_wu_load:
; CHECK64ZFBFMIN: # %bb.0:
-; CHECK64ZFBFMIN-NEXT: lwu a0, 0(a0)
+; CHECK64ZFBFMIN-NEXT: lw a0, 0(a0)
; CHECK64ZFBFMIN-NEXT: fcvt.s.wu fa5, a0
; CHECK64ZFBFMIN-NEXT: fcvt.bf16.s fa0, fa5
; CHECK64ZFBFMIN-NEXT: ret
@@ -1083,7 +1083,7 @@ define bfloat @fcvt_bf16_wu_load(ptr %p) nounwind {
; RV64ID: # %bb.0:
; RV64ID-NEXT: addi sp, sp, -16
; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-NEXT: lwu a0, 0(a0)
+; RV64ID-NEXT: lw a0, 0(a0)
; RV64ID-NEXT: fcvt.s.wu fa0, a0
; RV64ID-NEXT: call __truncsfbf2
; RV64ID-NEXT: fmv.x.w a0, fa0
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 72489185..530980c 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -63,7 +63,7 @@ define i8 @test_cttz_i8(i8 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -262,7 +262,7 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
; RV64I-NEXT: sext.w a1, a0
; RV64I-NEXT: beqz a1, .LBB2_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -270,16 +270,16 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -318,7 +318,7 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
; RV64M-NEXT: sext.w a1, a0
; RV64M-NEXT: beqz a1, .LBB2_2
; RV64M-NEXT: # %bb.1: # %cond.false
-; RV64M-NEXT: negw a1, a0
+; RV64M-NEXT: neg a1, a0
; RV64M-NEXT: and a0, a0, a1
; RV64M-NEXT: lui a1, 30667
; RV64M-NEXT: addi a1, a1, 1329
@@ -597,7 +597,7 @@ define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
; RV64NOZBB-NEXT: and a0, a0, a1
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -743,7 +743,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
;
; RV64I-LABEL: test_cttz_i32_zero_undef:
; RV64I: # %bb.0:
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -751,16 +751,16 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -788,7 +788,7 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
;
; RV64M-LABEL: test_cttz_i32_zero_undef:
; RV64M: # %bb.0:
-; RV64M-NEXT: negw a1, a0
+; RV64M-NEXT: neg a1, a0
; RV64M-NEXT: and a0, a0, a1
; RV64M-NEXT: lui a1, 30667
; RV64M-NEXT: addi a1, a1, 1329
@@ -1039,7 +1039,7 @@ define i8 @test_ctlz_i8(i8 %a) nounwind {
; RV64NOZBB-NEXT: not a0, a0
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -1711,7 +1711,7 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
; RV64NOZBB-NEXT: not a0, a0
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -2296,7 +2296,7 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
; RV64NOZBB: # %bb.0:
; RV64NOZBB-NEXT: srli a1, a0, 1
; RV64NOZBB-NEXT: andi a1, a1, 85
-; RV64NOZBB-NEXT: subw a0, a0, a1
+; RV64NOZBB-NEXT: sub a0, a0, a1
; RV64NOZBB-NEXT: andi a1, a0, 51
; RV64NOZBB-NEXT: srli a0, a0, 2
; RV64NOZBB-NEXT: andi a0, a0, 51
@@ -2336,7 +2336,7 @@ define i8 @test_ctpop_i8(i8 %a) nounwind {
; RV64XTHEADBB: # %bb.0:
; RV64XTHEADBB-NEXT: srli a1, a0, 1
; RV64XTHEADBB-NEXT: andi a1, a1, 85
-; RV64XTHEADBB-NEXT: subw a0, a0, a1
+; RV64XTHEADBB-NEXT: sub a0, a0, a1
; RV64XTHEADBB-NEXT: andi a1, a0, 51
; RV64XTHEADBB-NEXT: srli a0, a0, 2
; RV64XTHEADBB-NEXT: andi a0, a0, 51
diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index 637fb31..a1061fbb 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -163,7 +163,7 @@ define i64 @ctz_dereferencing_pointer_zext(ptr %b) nounwind {
; RV64I-LABEL: ctz_dereferencing_pointer_zext:
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: lw a0, 0(a0)
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -171,16 +171,16 @@ define i64 @ctz_dereferencing_pointer_zext(ptr %b) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -248,7 +248,7 @@ define signext i32 @ctz1(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz1:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -256,16 +256,16 @@ define signext i32 @ctz1(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -331,7 +331,7 @@ define signext i32 @ctz1_flipped(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz1_flipped:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -339,16 +339,16 @@ define signext i32 @ctz1_flipped(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -412,7 +412,7 @@ define signext i32 @ctz2(i32 signext %x) nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: beqz a0, .LBB4_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -420,16 +420,16 @@ define signext i32 @ctz2(i32 signext %x) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -490,7 +490,7 @@ define signext i32 @ctz3(i32 signext %x) nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: beqz a0, .LBB5_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: slli a1, a0, 6
; RV64I-NEXT: slli a2, a0, 8
@@ -498,16 +498,16 @@ define signext i32 @ctz3(i32 signext %x) nounwind {
; RV64I-NEXT: slli a4, a0, 12
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a0, 16
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 18
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a4, a0, 4
-; RV64I-NEXT: subw a4, a0, a4
+; RV64I-NEXT: sub a4, a0, a4
; RV64I-NEXT: add a1, a4, a1
; RV64I-NEXT: slli a4, a0, 14
-; RV64I-NEXT: subw a3, a3, a4
+; RV64I-NEXT: sub a3, a3, a4
; RV64I-NEXT: slli a4, a0, 23
-; RV64I-NEXT: subw a2, a2, a4
+; RV64I-NEXT: sub a2, a2, a4
; RV64I-NEXT: slli a0, a0, 27
; RV64I-NEXT: add a1, a1, a3
; RV64I-NEXT: add a0, a2, a0
@@ -824,7 +824,7 @@ define signext i32 @ctz5(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz5:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -832,16 +832,16 @@ define signext i32 @ctz5(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -907,7 +907,7 @@ define signext i32 @ctz6(i32 signext %x) nounwind {
;
; RV64I-LABEL: ctz6:
; RV64I: # %bb.0: # %entry
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -915,16 +915,16 @@ define signext i32 @ctz6(i32 signext %x) nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
@@ -997,7 +997,7 @@ define signext i32 @globalVar() nounwind {
; RV64I: # %bb.0: # %entry
; RV64I-NEXT: lui a0, %hi(global_x)
; RV64I-NEXT: lw a0, %lo(global_x)(a0)
-; RV64I-NEXT: negw a1, a0
+; RV64I-NEXT: neg a1, a0
; RV64I-NEXT: and a1, a0, a1
; RV64I-NEXT: slli a2, a1, 6
; RV64I-NEXT: slli a3, a1, 8
@@ -1005,16 +1005,16 @@ define signext i32 @globalVar() nounwind {
; RV64I-NEXT: slli a5, a1, 12
; RV64I-NEXT: add a2, a2, a3
; RV64I-NEXT: slli a3, a1, 16
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 18
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a5, a1, 4
-; RV64I-NEXT: subw a5, a1, a5
+; RV64I-NEXT: sub a5, a1, a5
; RV64I-NEXT: add a2, a5, a2
; RV64I-NEXT: slli a5, a1, 14
-; RV64I-NEXT: subw a4, a4, a5
+; RV64I-NEXT: sub a4, a4, a5
; RV64I-NEXT: slli a5, a1, 23
-; RV64I-NEXT: subw a3, a3, a5
+; RV64I-NEXT: sub a3, a3, a5
; RV64I-NEXT: slli a1, a1, 27
; RV64I-NEXT: add a2, a2, a4
; RV64I-NEXT: add a1, a3, a1
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index ea8b04d..53c3f58 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -54,7 +54,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a0, a0, a1
+; RV64IM-NEXT: sub a0, a0, a1
; RV64IM-NEXT: srliw a0, a0, 1
; RV64IM-NEXT: add a0, a0, a1
; RV64IM-NEXT: srli a0, a0, 2
@@ -67,7 +67,7 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV64IMZB-NEXT: addi a2, a2, -1755
; RV64IMZB-NEXT: mul a1, a1, a2
; RV64IMZB-NEXT: srli a1, a1, 32
-; RV64IMZB-NEXT: subw a0, a0, a1
+; RV64IMZB-NEXT: sub a0, a0, a1
; RV64IMZB-NEXT: srliw a0, a0, 1
; RV64IMZB-NEXT: add a0, a0, a1
; RV64IMZB-NEXT: srli a0, a0, 2
@@ -193,7 +193,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV64IM-NEXT: li a2, 37
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 8
-; RV64IM-NEXT: subw a0, a0, a1
+; RV64IM-NEXT: sub a0, a0, a1
; RV64IM-NEXT: slli a0, a0, 56
; RV64IM-NEXT: srli a0, a0, 57
; RV64IM-NEXT: add a0, a0, a1
@@ -206,7 +206,7 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV64IMZB-NEXT: sh3add a2, a1, a1
; RV64IMZB-NEXT: sh2add a1, a2, a1
; RV64IMZB-NEXT: srli a1, a1, 8
-; RV64IMZB-NEXT: subw a0, a0, a1
+; RV64IMZB-NEXT: sub a0, a0, a1
; RV64IMZB-NEXT: slli a0, a0, 56
; RV64IMZB-NEXT: srli a0, a0, 57
; RV64IMZB-NEXT: add a0, a0, a1
@@ -257,7 +257,7 @@ define i16 @udiv16_constant_add(i16 %a) nounwind {
; RV64-NEXT: lui a2, 149808
; RV64-NEXT: mulhu a1, a1, a2
; RV64-NEXT: srli a1, a1, 16
-; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: slli a0, a0, 48
; RV64-NEXT: srli a0, a0, 49
; RV64-NEXT: add a0, a0, a1
@@ -367,7 +367,7 @@ define i32 @sdiv_constant_sub_srai(i32 %a) nounwind {
; RV64-NEXT: addi a2, a2, -1171
; RV64-NEXT: mul a1, a1, a2
; RV64-NEXT: srli a1, a1, 32
-; RV64-NEXT: subw a1, a1, a0
+; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: srliw a0, a1, 31
; RV64-NEXT: sraiw a1, a1, 2
; RV64-NEXT: add a0, a1, a0
@@ -666,7 +666,7 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
; RV64IM-NEXT: srai a1, a1, 56
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 8
-; RV64IM-NEXT: subw a1, a1, a0
+; RV64IM-NEXT: sub a1, a1, a0
; RV64IM-NEXT: slli a1, a1, 56
; RV64IM-NEXT: srli a0, a1, 63
; RV64IM-NEXT: srai a1, a1, 58
@@ -679,7 +679,7 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
; RV64IMZB-NEXT: li a2, 109
; RV64IMZB-NEXT: mul a1, a1, a2
; RV64IMZB-NEXT: srli a1, a1, 8
-; RV64IMZB-NEXT: subw a1, a1, a0
+; RV64IMZB-NEXT: sub a1, a1, a0
; RV64IMZB-NEXT: slli a1, a1, 56
; RV64IMZB-NEXT: srli a0, a1, 63
; RV64IMZB-NEXT: srai a1, a1, 58
@@ -889,7 +889,7 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
; RV64IM-NEXT: addi a2, a2, 1911
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 16
-; RV64IM-NEXT: subw a1, a1, a0
+; RV64IM-NEXT: sub a1, a1, a0
; RV64IM-NEXT: slli a1, a1, 48
; RV64IM-NEXT: srli a0, a1, 63
; RV64IM-NEXT: srai a1, a1, 51
@@ -903,7 +903,7 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
; RV64IMZB-NEXT: addi a2, a2, 1911
; RV64IMZB-NEXT: mul a1, a1, a2
; RV64IMZB-NEXT: srli a1, a1, 16
-; RV64IMZB-NEXT: subw a1, a1, a0
+; RV64IMZB-NEXT: sub a1, a1, a0
; RV64IMZB-NEXT: slli a1, a1, 48
; RV64IMZB-NEXT: srli a0, a1, 63
; RV64IMZB-NEXT: srai a1, a1, 51
diff --git a/llvm/test/CodeGen/RISCV/double-convert-strict.ll b/llvm/test/CodeGen/RISCV/double-convert-strict.ll
index 2b1ec10..9a5e357 100644
--- a/llvm/test/CodeGen/RISCV/double-convert-strict.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert-strict.ll
@@ -347,17 +347,11 @@ define double @fcvt_d_wu(i32 %a) nounwind strictfp {
 declare double @llvm.experimental.constrained.uitofp.f64.i32(i32, metadata, metadata)
 define double @fcvt_d_wu_load(ptr %p) nounwind strictfp {
-; RV32IFD-LABEL: fcvt_d_wu_load:
-; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: lw a0, 0(a0)
-; RV32IFD-NEXT: fcvt.d.wu fa0, a0
-; RV32IFD-NEXT: ret
-;
-; RV64IFD-LABEL: fcvt_d_wu_load:
-; RV64IFD: # %bb.0:
-; RV64IFD-NEXT: lwu a0, 0(a0)
-; RV64IFD-NEXT: fcvt.d.wu fa0, a0
-; RV64IFD-NEXT: ret
+; CHECKIFD-LABEL: fcvt_d_wu_load:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: lw a0, 0(a0)
+; CHECKIFD-NEXT: fcvt.d.wu fa0, a0
+; CHECKIFD-NEXT: ret
;
; RV32IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV32IZFINXZDINX: # %bb.0:
@@ -367,7 +361,7 @@ define double @fcvt_d_wu_load(ptr %p) nounwind strictfp {
;
; RV64IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV64IZFINXZDINX: # %bb.0:
-; RV64IZFINXZDINX-NEXT: lwu a0, 0(a0)
+; RV64IZFINXZDINX-NEXT: lw a0, 0(a0)
; RV64IZFINXZDINX-NEXT: fcvt.d.wu a0, a0
; RV64IZFINXZDINX-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index fad9e21..a2e6186 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -582,17 +582,11 @@ define double @fcvt_d_wu(i32 %a) nounwind {
 }
 define double @fcvt_d_wu_load(ptr %p) nounwind {
-; RV32IFD-LABEL: fcvt_d_wu_load:
-; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: lw a0, 0(a0)
-; RV32IFD-NEXT: fcvt.d.wu fa0, a0
-; RV32IFD-NEXT: ret
-;
-; RV64IFD-LABEL: fcvt_d_wu_load:
-; RV64IFD: # %bb.0:
-; RV64IFD-NEXT: lwu a0, 0(a0)
-; RV64IFD-NEXT: fcvt.d.wu fa0, a0
-; RV64IFD-NEXT: ret
+; CHECKIFD-LABEL: fcvt_d_wu_load:
+; CHECKIFD: # %bb.0:
+; CHECKIFD-NEXT: lw a0, 0(a0)
+; CHECKIFD-NEXT: fcvt.d.wu fa0, a0
+; CHECKIFD-NEXT: ret
;
; RV32IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV32IZFINXZDINX: # %bb.0:
@@ -602,7 +596,7 @@ define double @fcvt_d_wu_load(ptr %p) nounwind {
;
; RV64IZFINXZDINX-LABEL: fcvt_d_wu_load:
; RV64IZFINXZDINX: # %bb.0:
-; RV64IZFINXZDINX-NEXT: lwu a0, 0(a0)
+; RV64IZFINXZDINX-NEXT: lw a0, 0(a0)
; RV64IZFINXZDINX-NEXT: fcvt.d.wu a0, a0
; RV64IZFINXZDINX-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/float-convert-strict.ll b/llvm/test/CodeGen/RISCV/float-convert-strict.ll
index 0c265e1..1b25a2b 100644
--- a/llvm/test/CodeGen/RISCV/float-convert-strict.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert-strict.ll
@@ -236,29 +236,17 @@ define float @fcvt_s_wu(i32 %a) nounwind strictfp {
 declare float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata, metadata)
 define float @fcvt_s_wu_load(ptr %p) nounwind strictfp {
-; RV32IF-LABEL: fcvt_s_wu_load:
-; RV32IF: # %bb.0:
-; RV32IF-NEXT: lw a0, 0(a0)
-; RV32IF-NEXT: fcvt.s.wu fa0, a0
-; RV32IF-NEXT: ret
-;
-; RV64IF-LABEL: fcvt_s_wu_load:
-; RV64IF: # %bb.0:
-; RV64IF-NEXT: lwu a0, 0(a0)
-; RV64IF-NEXT: fcvt.s.wu fa0, a0
-; RV64IF-NEXT: ret
-;
-; RV32IZFINX-LABEL: fcvt_s_wu_load:
-; RV32IZFINX: # %bb.0:
-; RV32IZFINX-NEXT: lw a0, 0(a0)
-; RV32IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV32IZFINX-NEXT: ret
+; CHECKIF-LABEL: fcvt_s_wu_load:
+; CHECKIF: # %bb.0:
+; CHECKIF-NEXT: lw a0, 0(a0)
+; CHECKIF-NEXT: fcvt.s.wu fa0, a0
+; CHECKIF-NEXT: ret
;
-; RV64IZFINX-LABEL: fcvt_s_wu_load:
-; RV64IZFINX: # %bb.0:
-; RV64IZFINX-NEXT: lwu a0, 0(a0)
-; RV64IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV64IZFINX-NEXT: ret
+; CHECKIZFINX-LABEL: fcvt_s_wu_load:
+; CHECKIZFINX: # %bb.0:
+; CHECKIZFINX-NEXT: lw a0, 0(a0)
+; CHECKIZFINX-NEXT: fcvt.s.wu a0, a0
+; CHECKIZFINX-NEXT: ret
;
; RV32I-LABEL: fcvt_s_wu_load:
; RV32I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index 1cb7b27..60349a0 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -482,29 +482,17 @@ define float @fcvt_s_wu(i32 %a) nounwind {
 }
 define float @fcvt_s_wu_load(ptr %p) nounwind {
-; RV32IF-LABEL: fcvt_s_wu_load:
-; RV32IF: # %bb.0:
-; RV32IF-NEXT: lw a0, 0(a0)
-; RV32IF-NEXT: fcvt.s.wu fa0, a0
-; RV32IF-NEXT: ret
-;
-; RV64IF-LABEL: fcvt_s_wu_load:
-; RV64IF: # %bb.0:
-; RV64IF-NEXT: lwu a0, 0(a0)
-; RV64IF-NEXT: fcvt.s.wu fa0, a0
-; RV64IF-NEXT: ret
-;
-; RV32IZFINX-LABEL: fcvt_s_wu_load:
-; RV32IZFINX: # %bb.0:
-; RV32IZFINX-NEXT: lw a0, 0(a0)
-; RV32IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV32IZFINX-NEXT: ret
+; CHECKIF-LABEL: fcvt_s_wu_load:
+; CHECKIF: # %bb.0:
+; CHECKIF-NEXT: lw a0, 0(a0)
+; CHECKIF-NEXT: fcvt.s.wu fa0, a0
+; CHECKIF-NEXT: ret
;
-; RV64IZFINX-LABEL: fcvt_s_wu_load:
-; RV64IZFINX: # %bb.0:
-; RV64IZFINX-NEXT: lwu a0, 0(a0)
-; RV64IZFINX-NEXT: fcvt.s.wu a0, a0
-; RV64IZFINX-NEXT: ret
+; CHECKIZFINX-LABEL: fcvt_s_wu_load:
+; CHECKIZFINX: # %bb.0:
+; CHECKIZFINX-NEXT: lw a0, 0(a0)
+; CHECKIZFINX-NEXT: fcvt.s.wu a0, a0
+; CHECKIZFINX-NEXT: ret
;
; RV32I-LABEL: fcvt_s_wu_load:
; RV32I: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 246e6a6..117e3e4 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IF-NEXT: mv a1, a0
; RV32IF-NEXT: addi a0, sp, 8
; RV32IF-NEXT: call __fixdfti
-; RV32IF-NEXT: lw a0, 8(sp)
-; RV32IF-NEXT: lw a1, 12(sp)
-; RV32IF-NEXT: lw a2, 20(sp)
+; RV32IF-NEXT: lw a0, 20(sp)
+; RV32IF-NEXT: lw a1, 8(sp)
+; RV32IF-NEXT: lw a2, 12(sp)
; RV32IF-NEXT: lw a3, 16(sp)
-; RV32IF-NEXT: beqz a2, .LBB47_2
+; RV32IF-NEXT: beqz a0, .LBB47_2
; RV32IF-NEXT: # %bb.1: # %entry
-; RV32IF-NEXT: slti a4, a2, 0
+; RV32IF-NEXT: slti a4, a0, 0
; RV32IF-NEXT: j .LBB47_3
; RV32IF-NEXT: .LBB47_2:
; RV32IF-NEXT: seqz a4, a3
; RV32IF-NEXT: .LBB47_3: # %entry
; RV32IF-NEXT: xori a3, a3, 1
-; RV32IF-NEXT: or a3, a3, a2
+; RV32IF-NEXT: or a3, a3, a0
; RV32IF-NEXT: seqz a3, a3
; RV32IF-NEXT: addi a3, a3, -1
; RV32IF-NEXT: and a3, a3, a4
; RV32IF-NEXT: neg a3, a3
+; RV32IF-NEXT: and a2, a3, a2
; RV32IF-NEXT: and a1, a3, a1
; RV32IF-NEXT: and a0, a3, a0
-; RV32IF-NEXT: and a2, a3, a2
-; RV32IF-NEXT: slti a2, a2, 0
-; RV32IF-NEXT: addi a2, a2, -1
-; RV32IF-NEXT: and a0, a2, a0
-; RV32IF-NEXT: and a1, a2, a1
+; RV32IF-NEXT: slti a0, a0, 0
+; RV32IF-NEXT: addi a3, a0, -1
+; RV32IF-NEXT: and a0, a3, a1
+; RV32IF-NEXT: and a1, a3, a2
; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IF-NEXT: .cfi_restore ra
; RV32IF-NEXT: addi sp, sp, 32
@@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) {
; RV32IFD-NEXT: .cfi_offset ra, -4
; RV32IFD-NEXT: addi a0, sp, 8
; RV32IFD-NEXT: call __fixdfti
-; RV32IFD-NEXT: lw a0, 8(sp)
-; RV32IFD-NEXT: lw a1, 12(sp)
-; RV32IFD-NEXT: lw a2, 20(sp)
+; RV32IFD-NEXT: lw a0, 20(sp)
+; RV32IFD-NEXT: lw a1, 8(sp)
+; RV32IFD-NEXT: lw a2, 12(sp)
; RV32IFD-NEXT: lw a3, 16(sp)
-; RV32IFD-NEXT: beqz a2, .LBB47_2
+; RV32IFD-NEXT: beqz a0, .LBB47_2
; RV32IFD-NEXT: # %bb.1: # %entry
-; RV32IFD-NEXT: slti a4, a2, 0
+; RV32IFD-NEXT: slti a4, a0, 0
; RV32IFD-NEXT: j .LBB47_3
; RV32IFD-NEXT: .LBB47_2:
; RV32IFD-NEXT: seqz a4, a3
; RV32IFD-NEXT: .LBB47_3: # %entry
; RV32IFD-NEXT: xori a3, a3, 1
-; RV32IFD-NEXT: or a3, a3, a2
+; RV32IFD-NEXT: or a3, a3, a0
; RV32IFD-NEXT: seqz a3, a3
; RV32IFD-NEXT: addi a3, a3, -1
; RV32IFD-NEXT: and a3, a3, a4
; RV32IFD-NEXT: neg a3, a3
+; RV32IFD-NEXT: and a2, a3, a2
; RV32IFD-NEXT: and a1, a3, a1
; RV32IFD-NEXT: and a0, a3, a0
-; RV32IFD-NEXT: and a2, a3, a2
-; RV32IFD-NEXT: slti a2, a2, 0
-; RV32IFD-NEXT: addi a2, a2, -1
-; RV32IFD-NEXT: and a0, a2, a0
-; RV32IFD-NEXT: and a1, a2, a1
+; RV32IFD-NEXT: slti a0, a0, 0
+; RV32IFD-NEXT: addi a3, a0, -1
+; RV32IFD-NEXT: and a0, a3, a1
+; RV32IFD-NEXT: and a1, a3, a2
; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32IFD-NEXT: .cfi_restore ra
; RV32IFD-NEXT: addi sp, sp, 32
@@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) {
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 8(sp)
-; RV32-NEXT: lw a1, 12(sp)
-; RV32-NEXT: lw a2, 20(sp)
+; RV32-NEXT: lw a0, 20(sp)
+; RV32-NEXT: lw a1, 8(sp)
+; RV32-NEXT: lw a2, 12(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a2, .LBB50_2
+; RV32-NEXT: beqz a0, .LBB50_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a2, 0
+; RV32-NEXT: slti a4, a0, 0
; RV32-NEXT: j .LBB50_3
; RV32-NEXT: .LBB50_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB50_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a2
+; RV32-NEXT: or a3, a3, a0
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: and a2, a3, a2
-; RV32-NEXT: slti a2, a2, 0
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a0, a2, a0
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: addi a3, a0, -1
+; RV32-NEXT: and a0, a3, a1
+; RV32-NEXT: and a1, a3, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
@@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) {
; RV32-NEXT: call __extendhfsf2
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: call __fixsfti
-; RV32-NEXT: lw a0, 8(sp)
-; RV32-NEXT: lw a1, 12(sp)
-; RV32-NEXT: lw a2, 20(sp)
+; RV32-NEXT: lw a0, 20(sp)
+; RV32-NEXT: lw a1, 8(sp)
+; RV32-NEXT: lw a2, 12(sp)
; RV32-NEXT: lw a3, 16(sp)
-; RV32-NEXT: beqz a2, .LBB53_2
+; RV32-NEXT: beqz a0, .LBB53_2
; RV32-NEXT: # %bb.1: # %entry
-; RV32-NEXT: slti a4, a2, 0
+; RV32-NEXT: slti a4, a0, 0
; RV32-NEXT: j .LBB53_3
; RV32-NEXT: .LBB53_2:
; RV32-NEXT: seqz a4, a3
; RV32-NEXT: .LBB53_3: # %entry
; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a3, a3, a2
+; RV32-NEXT: or a3, a3, a0
; RV32-NEXT: seqz a3, a3
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: and a3, a3, a4
; RV32-NEXT: neg a3, a3
+; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: and a1, a3, a1
; RV32-NEXT: and a0, a3, a0
-; RV32-NEXT: and a2, a3, a2
-; RV32-NEXT: slti a2, a2, 0
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a0, a2, a0
-; RV32-NEXT: and a1, a2, a1
+; RV32-NEXT: slti a0, a0, 0
+; RV32-NEXT: addi a3, a0, -1
+; RV32-NEXT: and a0, a3, a1
+; RV32-NEXT: and a1, a3, a2
; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/half-convert-strict.ll b/llvm/test/CodeGen/RISCV/half-convert-strict.ll
index 0a04d44..675e230 100644
--- a/llvm/test/CodeGen/RISCV/half-convert-strict.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert-strict.ll
@@ -1461,29 +1461,17 @@ define half @fcvt_h_wu(i32 %a) nounwind strictfp {
 declare half @llvm.experimental.constrained.uitofp.f16.i32(i32, metadata, metadata)
 define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
-; RV32IZFH-LABEL: fcvt_h_wu_load:
-; RV32IZFH: # %bb.0:
-; RV32IZFH-NEXT: lw a0, 0(a0)
-; RV32IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV32IZFH-NEXT: ret
-;
-; RV64IZFH-LABEL: fcvt_h_wu_load:
-; RV64IZFH: # %bb.0:
-; RV64IZFH-NEXT: lwu a0, 0(a0)
-; RV64IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV64IZFH-NEXT: ret
-;
-; RV32IZHINX-LABEL: fcvt_h_wu_load:
-; RV32IZHINX: # %bb.0:
-; RV32IZHINX-NEXT: lw a0, 0(a0)
-; RV32IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV32IZHINX-NEXT: ret
+; CHECKIZFH-LABEL: fcvt_h_wu_load:
+; CHECKIZFH: # %bb.0:
+; CHECKIZFH-NEXT: lw a0, 0(a0)
+; CHECKIZFH-NEXT: fcvt.h.wu fa0, a0
+; CHECKIZFH-NEXT: ret
;
-; RV64IZHINX-LABEL: fcvt_h_wu_load:
-; RV64IZHINX: # %bb.0:
-; RV64IZHINX-NEXT: lwu a0, 0(a0)
-; RV64IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV64IZHINX-NEXT: ret
+; CHECKIZHINX-LABEL: fcvt_h_wu_load:
+; CHECKIZHINX: # %bb.0:
+; CHECKIZHINX-NEXT: lw a0, 0(a0)
+; CHECKIZHINX-NEXT: fcvt.h.wu a0, a0
+; CHECKIZHINX-NEXT: ret
;
; RV32IDZFH-LABEL: fcvt_h_wu_load:
; RV32IDZFH: # %bb.0:
@@ -1493,7 +1481,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; RV64IDZFH-LABEL: fcvt_h_wu_load:
; RV64IDZFH: # %bb.0:
-; RV64IDZFH-NEXT: lwu a0, 0(a0)
+; RV64IDZFH-NEXT: lw a0, 0(a0)
; RV64IDZFH-NEXT: fcvt.h.wu fa0, a0
; RV64IDZFH-NEXT: ret
;
@@ -1505,7 +1493,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; RV64IZDINXZHINX-LABEL: fcvt_h_wu_load:
; RV64IZDINXZHINX: # %bb.0:
-; RV64IZDINXZHINX-NEXT: lwu a0, 0(a0)
+; RV64IZDINXZHINX-NEXT: lw a0, 0(a0)
; RV64IZDINXZHINX-NEXT: fcvt.h.wu a0, a0
; RV64IZDINXZHINX-NEXT: ret
;
@@ -1518,7 +1506,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; CHECK64-IZFHMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZFHMIN: # %bb.0:
-; CHECK64-IZFHMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZFHMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZFHMIN-NEXT: fcvt.s.wu fa5, a0
; CHECK64-IZFHMIN-NEXT: fcvt.h.s fa0, fa5
; CHECK64-IZFHMIN-NEXT: ret
@@ -1532,7 +1520,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; CHECK64-IZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZHINXMIN: # %bb.0:
-; CHECK64-IZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZHINXMIN-NEXT: ret
@@ -1546,7 +1534,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind strictfp {
;
; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZDINXZHINXMIN: # %bb.0:
-; CHECK64-IZDINXZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZDINXZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index c53237e..facb544 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -4388,17 +4388,11 @@ define half @fcvt_h_wu(i32 %a) nounwind {
 }
 define half @fcvt_h_wu_load(ptr %p) nounwind {
-; RV32IZFH-LABEL: fcvt_h_wu_load:
-; RV32IZFH: # %bb.0:
-; RV32IZFH-NEXT: lw a0, 0(a0)
-; RV32IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV32IZFH-NEXT: ret
-;
-; RV64IZFH-LABEL: fcvt_h_wu_load:
-; RV64IZFH: # %bb.0:
-; RV64IZFH-NEXT: lwu a0, 0(a0)
-; RV64IZFH-NEXT: fcvt.h.wu fa0, a0
-; RV64IZFH-NEXT: ret
+; CHECKIZFH-LABEL: fcvt_h_wu_load:
+; CHECKIZFH: # %bb.0:
+; CHECKIZFH-NEXT: lw a0, 0(a0)
+; CHECKIZFH-NEXT: fcvt.h.wu fa0, a0
+; CHECKIZFH-NEXT: ret
;
; RV32IDZFH-LABEL: fcvt_h_wu_load:
; RV32IDZFH: # %bb.0:
@@ -4408,33 +4402,21 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; RV64IDZFH-LABEL: fcvt_h_wu_load:
; RV64IDZFH: # %bb.0:
-; RV64IDZFH-NEXT: lwu a0, 0(a0)
+; RV64IDZFH-NEXT: lw a0, 0(a0)
; RV64IDZFH-NEXT: fcvt.h.wu fa0, a0
; RV64IDZFH-NEXT: ret
;
-; RV32IZHINX-LABEL: fcvt_h_wu_load:
-; RV32IZHINX: # %bb.0:
-; RV32IZHINX-NEXT: lw a0, 0(a0)
-; RV32IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV32IZHINX-NEXT: ret
-;
-; RV64IZHINX-LABEL: fcvt_h_wu_load:
-; RV64IZHINX: # %bb.0:
-; RV64IZHINX-NEXT: lwu a0, 0(a0)
-; RV64IZHINX-NEXT: fcvt.h.wu a0, a0
-; RV64IZHINX-NEXT: ret
-;
-; RV32IZDINXZHINX-LABEL: fcvt_h_wu_load:
-; RV32IZDINXZHINX: # %bb.0:
-; RV32IZDINXZHINX-NEXT: lw a0, 0(a0)
-; RV32IZDINXZHINX-NEXT: fcvt.h.wu a0, a0
-; RV32IZDINXZHINX-NEXT: ret
+; CHECKIZHINX-LABEL: fcvt_h_wu_load:
+; CHECKIZHINX: # %bb.0:
+; CHECKIZHINX-NEXT: lw a0, 0(a0)
+; CHECKIZHINX-NEXT: fcvt.h.wu a0, a0
+; CHECKIZHINX-NEXT: ret
;
-; RV64IZDINXZHINX-LABEL: fcvt_h_wu_load:
-; RV64IZDINXZHINX: # %bb.0:
-; RV64IZDINXZHINX-NEXT: lwu a0, 0(a0)
-; RV64IZDINXZHINX-NEXT: fcvt.h.wu a0, a0
-; RV64IZDINXZHINX-NEXT: ret
+; CHECKIZDINXZHINX-LABEL: fcvt_h_wu_load:
+; CHECKIZDINXZHINX: # %bb.0:
+; CHECKIZDINXZHINX-NEXT: lw a0, 0(a0)
+; CHECKIZDINXZHINX-NEXT: fcvt.h.wu a0, a0
+; CHECKIZDINXZHINX-NEXT: ret
;
; RV32I-LABEL: fcvt_h_wu_load:
; RV32I: # %bb.0:
@@ -4476,7 +4458,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
; RV64ID-LP64: # %bb.0:
; RV64ID-LP64-NEXT: addi sp, sp, -16
; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-LP64-NEXT: lwu a0, 0(a0)
+; RV64ID-LP64-NEXT: lw a0, 0(a0)
; RV64ID-LP64-NEXT: fcvt.s.wu fa5, a0
; RV64ID-LP64-NEXT: fmv.x.w a0, fa5
; RV64ID-LP64-NEXT: call __truncsfhf2
@@ -4505,7 +4487,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
; RV64ID: # %bb.0:
; RV64ID-NEXT: addi sp, sp, -16
; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64ID-NEXT: lwu a0, 0(a0)
+; RV64ID-NEXT: lw a0, 0(a0)
; RV64ID-NEXT: fcvt.s.wu fa0, a0
; RV64ID-NEXT: call __truncsfhf2
; RV64ID-NEXT: fmv.x.w a0, fa0
@@ -4525,7 +4507,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; CHECK64-IZFHMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZFHMIN: # %bb.0:
-; CHECK64-IZFHMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZFHMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZFHMIN-NEXT: fcvt.s.wu fa5, a0
; CHECK64-IZFHMIN-NEXT: fcvt.h.s fa0, fa5
; CHECK64-IZFHMIN-NEXT: ret
@@ -4539,7 +4521,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; CHECK64-IZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZHINXMIN: # %bb.0:
-; CHECK64-IZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZHINXMIN-NEXT: ret
@@ -4553,7 +4535,7 @@ define half @fcvt_h_wu_load(ptr %p) nounwind {
;
; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_h_wu_load:
; CHECK64-IZDINXZHINXMIN: # %bb.0:
-; CHECK64-IZDINXZHINXMIN-NEXT: lwu a0, 0(a0)
+; CHECK64-IZDINXZHINXMIN-NEXT: lw a0, 0(a0)
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.wu a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.h.s a0, a0
; CHECK64-IZDINXZHINXMIN-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 66cde32..774f1a1 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -651,7 +651,7 @@ define void @zext16_abs8(i8 %x, ptr %p) {
; RV64I-NEXT: srai a2, a0, 63
; RV64I-NEXT: srai a0, a0, 56
; RV64I-NEXT: xor a0, a0, a2
-; RV64I-NEXT: subw a0, a0, a2
+; RV64I-NEXT: sub a0, a0, a2
; RV64I-NEXT: sh a0, 0(a1)
; RV64I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/interrupt-attr.ll b/llvm/test/CodeGen/RISCV/interrupt-attr.ll
index e278b8d..472b903 100644
--- a/llvm/test/CodeGen/RISCV/interrupt-attr.ll
+++ b/llvm/test/CodeGen/RISCV/interrupt-attr.ll
@@ -794,498 +794,46 @@ define void @foo_with_call() #1 {
; CHECK-RV32-V-NEXT: slli a0, a0, 5
; CHECK-RV32-V-NEXT: sub sp, sp, a0
; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 5
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: add a0, sp, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, 16
-; CHECK-RV32-V-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-;
CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 3 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb ; CHECK-RV32-V-NEXT: slli a0, a0, 3 ; CHECK-RV32-V-NEXT: mv a1, a0 ; CHECK-RV32-V-NEXT: slli a0, a0, 1 ; CHECK-RV32-V-NEXT: add a0, a0, a1 ; CHECK-RV32-V-NEXT: add a0, sp, a0 ; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 3 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 3 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 4 -; CHECK-RV32-V-NEXT: add a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-V-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-V-NEXT: csrr a0, vlenb ; CHECK-RV32-V-NEXT: slli a0, a0, 4 ; CHECK-RV32-V-NEXT: add a0, sp, a0 ; 
CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 4 -; CHECK-RV32-V-NEXT: sub a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 3 -; CHECK-RV32-V-NEXT: add a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-V-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-V-NEXT: csrr a0, vlenb ; CHECK-RV32-V-NEXT: slli a0, a0, 3 ; CHECK-RV32-V-NEXT: add a0, sp, a0 ; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 3 -; CHECK-RV32-V-NEXT: sub a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; 
CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-V-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-V-NEXT: addi a0, sp, 16 -; CHECK-RV32-V-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-V-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-V-NEXT: call otherfoo ; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 5 -; CHECK-RV32-V-NEXT: sub a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb 
-; CHECK-RV32-V-NEXT: mv a1, a0 ; CHECK-RV32-V-NEXT: slli a0, a0, 3 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 3 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb ; CHECK-RV32-V-NEXT: mv a1, a0 ; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 3 ; CHECK-RV32-V-NEXT: add a0, a0, a1 ; CHECK-RV32-V-NEXT: add a0, sp, a0 ; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 3 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 4 -; CHECK-RV32-V-NEXT: add a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-V-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-V-NEXT: csrr a0, vlenb ; CHECK-RV32-V-NEXT: slli a0, a0, 4 ; CHECK-RV32-V-NEXT: add a0, sp, a0 ; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 4 -; CHECK-RV32-V-NEXT: sub a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; 
CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a1, a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 3 -; CHECK-RV32-V-NEXT: add a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-V-NEXT: csrr a0, vlenb ; CHECK-RV32-V-NEXT: slli a0, a0, 3 ; CHECK-RV32-V-NEXT: add a0, sp, a0 ; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 3 -; CHECK-RV32-V-NEXT: sub a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: mv a1, a0 -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a0, a1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a1, a0, 2 -; CHECK-RV32-V-NEXT: add a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 2 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli 
a1, a0, 1 -; CHECK-RV32-V-NEXT: add a0, a1, a0 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: slli a0, a0, 1 -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-V-NEXT: csrr a0, vlenb -; CHECK-RV32-V-NEXT: add a0, sp, a0 -; CHECK-RV32-V-NEXT: addi a0, a0, 16 -; CHECK-RV32-V-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-V-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-V-NEXT: addi a0, sp, 16 -; CHECK-RV32-V-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-V-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-V-NEXT: csrr a0, vlenb ; CHECK-RV32-V-NEXT: slli a0, a0, 5 ; CHECK-RV32-V-NEXT: add sp, sp, a0 @@ -1351,498 +899,46 @@ define void @foo_with_call() #1 { ; CHECK-RV32-FV-NEXT: slli a0, a0, 5 ; CHECK-RV32-FV-NEXT: sub sp, sp, a0 ; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 5 -; CHECK-RV32-FV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill -; 
CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb ; CHECK-RV32-FV-NEXT: slli a0, a0, 3 ; CHECK-RV32-FV-NEXT: mv a1, a0 ; CHECK-RV32-FV-NEXT: slli a0, a0, 1 ; CHECK-RV32-FV-NEXT: add a0, a0, a1 ; CHECK-RV32-FV-NEXT: add a0, sp, a0 ; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 4 -; CHECK-RV32-FV-NEXT: add a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-FV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-FV-NEXT: csrr a0, vlenb ; CHECK-RV32-FV-NEXT: slli a0, a0, 4 ; CHECK-RV32-FV-NEXT: add a0, sp, a0 ; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; 
CHECK-RV32-FV-NEXT: slli a1, a0, 4 -; CHECK-RV32-FV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 3 -; CHECK-RV32-FV-NEXT: add a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-FV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-FV-NEXT: csrr a0, vlenb ; CHECK-RV32-FV-NEXT: slli a0, a0, 3 ; CHECK-RV32-FV-NEXT: add a0, sp, a0 ; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 3 -; CHECK-RV32-FV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, 
sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-FV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-FV-NEXT: addi a0, sp, 16 -; CHECK-RV32-FV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-FV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-FV-NEXT: call otherfoo ; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 5 -; CHECK-RV32-FV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload -; 
CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb ; CHECK-RV32-FV-NEXT: slli a0, a0, 3 ; CHECK-RV32-FV-NEXT: mv a1, a0 ; CHECK-RV32-FV-NEXT: slli a0, a0, 1 ; CHECK-RV32-FV-NEXT: add a0, a0, a1 ; CHECK-RV32-FV-NEXT: add a0, sp, a0 ; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 4 -; CHECK-RV32-FV-NEXT: add a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FV-NEXT: csrr a0, vlenb ; CHECK-RV32-FV-NEXT: slli a0, a0, 4 ; CHECK-RV32-FV-NEXT: add a0, sp, a0 ; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, 
vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 4 -; CHECK-RV32-FV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a1, a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 3 -; CHECK-RV32-FV-NEXT: add a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FV-NEXT: csrr a0, vlenb ; CHECK-RV32-FV-NEXT: slli a0, a0, 3 ; CHECK-RV32-FV-NEXT: add a0, sp, a0 ; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 3 -; CHECK-RV32-FV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: mv a1, a0 -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a0, a1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 2 -; CHECK-RV32-FV-NEXT: add a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 2 -; 
CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a1, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, a1, a0 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FV-NEXT: csrr a0, vlenb -; CHECK-RV32-FV-NEXT: add a0, sp, a0 -; CHECK-RV32-FV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FV-NEXT: addi a0, sp, 16 -; CHECK-RV32-FV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FV-NEXT: csrr a0, vlenb ; CHECK-RV32-FV-NEXT: slli a0, a0, 5 ; CHECK-RV32-FV-NEXT: add sp, sp, a0 @@ -1928,498 +1024,46 @@ define void @foo_with_call() #1 { ; CHECK-RV32-FDV-NEXT: slli a0, a0, 5 ; CHECK-RV32-FDV-NEXT: sub sp, sp, a0 ; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 5 -; CHECK-RV32-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: 
slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 ; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb ; CHECK-RV32-FDV-NEXT: mv a1, a0 ; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 ; CHECK-RV32-FDV-NEXT: add a0, a0, a1 ; CHECK-RV32-FDV-NEXT: add a0, sp, a0 ; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 4 -; CHECK-RV32-FDV-NEXT: add a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v14, (a0) # 
vscale x 8-byte Folded Spill +; CHECK-RV32-FDV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-FDV-NEXT: csrr a0, vlenb ; CHECK-RV32-FDV-NEXT: slli a0, a0, 4 ; CHECK-RV32-FDV-NEXT: add a0, sp, a0 ; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 4 -; CHECK-RV32-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 3 -; CHECK-RV32-FDV-NEXT: add a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-FDV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-FDV-NEXT: csrr a0, vlenb ; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 ; CHECK-RV32-FDV-NEXT: add a0, sp, a0 ; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 3 -; CHECK-RV32-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 
16 -; CHECK-RV32-FDV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-FDV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-FDV-NEXT: addi a0, sp, 16 -; CHECK-RV32-FDV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-FDV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-FDV-NEXT: call otherfoo ; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 5 -; CHECK-RV32-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; 
CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb ; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 ; CHECK-RV32-FDV-NEXT: mv a1, a0 ; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 ; CHECK-RV32-FDV-NEXT: add a0, a0, a1 ; CHECK-RV32-FDV-NEXT: add a0, sp, a0 ; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte 
Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 4 -; CHECK-RV32-FDV-NEXT: add a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FDV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FDV-NEXT: csrr a0, vlenb ; CHECK-RV32-FDV-NEXT: slli a0, a0, 4 ; CHECK-RV32-FDV-NEXT: add a0, sp, a0 ; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 4 -; CHECK-RV32-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 3 -; CHECK-RV32-FDV-NEXT: add a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FDV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FDV-NEXT: csrr a0, vlenb ; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 ; CHECK-RV32-FDV-NEXT: add a0, sp, a0 ; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 3 -; CHECK-RV32-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload 
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a1, a0 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: add a0, sp, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FDV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FDV-NEXT: addi a0, sp, 16 -; CHECK-RV32-FDV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FDV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FDV-NEXT: csrr a0, vlenb ; CHECK-RV32-FDV-NEXT: slli a0, a0, 5 ; CHECK-RV32-FDV-NEXT: add sp, sp, a0 @@ -3259,498 +1903,46 @@ define void @foo_with_call() #1 { ; CHECK-RV64-V-NEXT: slli a0, a0, 5 ; CHECK-RV64-V-NEXT: sub sp, sp, a0 ; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 5 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v 
v3, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 3 ; CHECK-RV64-V-NEXT: mv a1, a0 ; CHECK-RV64-V-NEXT: slli a0, a0, 1 ; CHECK-RV64-V-NEXT: add a0, a0, a1 ; CHECK-RV64-V-NEXT: add a0, sp, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; 
CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 4 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-V-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 4 ; CHECK-RV64-V-NEXT: add a0, sp, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 4 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-V-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 3 ; CHECK-RV64-V-NEXT: add a0, sp, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 3 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v24, (a0) # 
vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-V-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-V-NEXT: addi a0, sp, 16 -; CHECK-RV64-V-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-V-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-V-NEXT: call otherfoo ; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 5 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; 
CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 ; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: mv a1, a0 ; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 3 ; CHECK-RV64-V-NEXT: add a0, a0, a1 ; CHECK-RV64-V-NEXT: add a0, sp, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; 
CHECK-RV64-V-NEXT: slli a1, a0, 4 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-V-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 4 ; CHECK-RV64-V-NEXT: add a0, sp, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 4 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 3 ; CHECK-RV64-V-NEXT: add a0, sp, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 3 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: add a0, sp, 
a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: add a0, sp, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, 16 -; CHECK-RV64-V-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-V-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-V-NEXT: addi a0, sp, 16 -; CHECK-RV64-V-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-V-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 5 ; CHECK-RV64-V-NEXT: add sp, sp, a0 @@ -3816,498 +2008,46 @@ define void @foo_with_call() #1 { ; CHECK-RV64-FV-NEXT: slli a0, a0, 5 ; CHECK-RV64-FV-NEXT: sub sp, sp, a0 ; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 5 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add 
a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FV-NEXT: mv a1, a0 ; CHECK-RV64-FV-NEXT: slli a0, a0, 1 ; CHECK-RV64-FV-NEXT: add a0, a0, a1 ; CHECK-RV64-FV-NEXT: add a0, sp, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: 
vs1r.v v13, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 4 ; CHECK-RV64-FV-NEXT: add a0, sp, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FV-NEXT: add a0, sp, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; 
CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FV-NEXT: addi a0, sp, 16 -; CHECK-RV64-FV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FV-NEXT: call otherfoo ; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 5 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add 
a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FV-NEXT: mv a1, a0 ; CHECK-RV64-FV-NEXT: slli a0, a0, 1 ; CHECK-RV64-FV-NEXT: add a0, a0, a1 ; CHECK-RV64-FV-NEXT: add a0, sp, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; 
CHECK-RV64-FV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 4 ; CHECK-RV64-FV-NEXT: add a0, sp, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FV-NEXT: add a0, sp, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: 
csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: add a0, sp, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FV-NEXT: addi a0, sp, 16 -; CHECK-RV64-FV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 5 ; CHECK-RV64-FV-NEXT: add sp, sp, a0 @@ -4393,498 +2133,46 @@ define void @foo_with_call() #1 { ; CHECK-RV64-FDV-NEXT: slli a0, a0, 5 ; CHECK-RV64-FDV-NEXT: sub sp, sp, a0 ; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 5 -; CHECK-RV64-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; 
CHECK-RV64-FDV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 ; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: add a0, sp, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, 16 -; CHECK-RV64-FDV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb ; CHECK-RV64-FDV-NEXT: mv a1, a0 ; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FDV-NEXT: add a0, a0, a1 ; CHECK-RV64-FDV-NEXT: add a0, sp, a0 ; CHECK-RV64-FDV-NEXT: addi a0, 
a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 4
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: addi a0, sp, 16
-; CHECK-RV64-FDV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV64-FDV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV64-FDV-NEXT: call otherfoo
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 5
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: mv a1, a0
; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
; CHECK-RV64-FDV-NEXT: add a0, a0, a1
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 4
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a1, a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 3
; CHECK-RV64-FDV-NEXT: add a0, sp, a0
; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV64-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: mv a1, a0
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a0, a1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, a1, a0
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV64-FDV-NEXT: add a0, sp, a0
-; CHECK-RV64-FDV-NEXT: addi a0, a0, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: addi a0, sp, 16
-; CHECK-RV64-FDV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV64-FDV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV64-FDV-NEXT: csrr a0, vlenb
; CHECK-RV64-FDV-NEXT: slli a0, a0, 5
; CHECK-RV64-FDV-NEXT: add sp, sp, a0
@@ -5670,422 +2958,39 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-V-NEXT: slli a0, a0, 5
; CHECK-RV32-V-NEXT: sub sp, sp, a0
; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 3
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 4
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
; CHECK-RV32-V-NEXT: slli a0, a0, 1
; CHECK-RV32-V-NEXT: add a0, a0, a1
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 5
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 5
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-V-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-V-NEXT: call otherfoo
; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 3
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 4
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 4
-; CHECK-RV32-V-NEXT: add a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 3
; CHECK-RV32-V-NEXT: mv a1, a0
@@ -6093,81 +2998,12 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-V-NEXT: add a0, a0, a1
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 3
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 2
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: mv a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a1, a1, a0
-; CHECK-RV32-V-NEXT: slli a0, a0, 1
-; CHECK-RV32-V-NEXT: add a0, a0, a1
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-V-NEXT: csrr a0, vlenb
-; CHECK-RV32-V-NEXT: slli a1, a0, 5
-; CHECK-RV32-V-NEXT: sub a0, a1, a0
-; CHECK-RV32-V-NEXT: sub a0, s0, a0
-; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: csrr a0, vlenb
; CHECK-RV32-V-NEXT: slli a0, a0, 5
; CHECK-RV32-V-NEXT: sub a0, s0, a0
; CHECK-RV32-V-NEXT: addi a0, a0, -80
-; CHECK-RV32-V-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-V-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-V-NEXT: addi sp, s0, -80
; CHECK-RV32-V-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; CHECK-RV32-V-NEXT: lw t0, 72(sp) # 4-byte Folded Reload
@@ -6234,172 +3070,15 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-FV-NEXT: slli a0, a0, 5
; CHECK-RV32-FV-NEXT: sub sp, sp, a0
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 4
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
; CHECK-RV32-FV-NEXT: mv a1, a0
@@ -6407,331 +3086,36 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-FV-NEXT: add a0, a0, a1
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 5
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FV-NEXT: call otherfoo
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 4
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FV-NEXT: add a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
; CHECK-RV32-FV-NEXT: slli a0, a0, 1
; CHECK-RV32-FV-NEXT: add a0, a0, a1
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: mv a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a1, a1, a0
-; CHECK-RV32-FV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FV-NEXT: add a0, a0, a1
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: csrr a0, vlenb
; CHECK-RV32-FV-NEXT: slli a0, a0, 5
; CHECK-RV32-FV-NEXT: sub a0, s0, a0
; CHECK-RV32-FV-NEXT: addi a0, a0, -160
-; CHECK-RV32-FV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload
+; CHECK-RV32-FV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
; CHECK-RV32-FV-NEXT: addi sp, s0, -160
; CHECK-RV32-FV-NEXT: lw ra, 156(sp) # 4-byte Folded Reload
; CHECK-RV32-FV-NEXT: lw t0, 152(sp) # 4-byte Folded Reload
@@ -6818,172 +3202,15 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-FDV-NEXT: slli a0, a0, 5
; CHECK-RV32-FDV-NEXT: sub sp, sp, a0
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 4
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 4
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: mv a1, a0
@@ -6991,249 +3218,23 @@ define void @foo_fp_with_call() #2 {
; CHECK-RV32-FDV-NEXT: add a0, a0, a1
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 5
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 5
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill
+; CHECK-RV32-FDV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill
; CHECK-RV32-FDV-NEXT: call otherfoo
; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: sub a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
; CHECK-RV32-FDV-NEXT: slli a0, a0, 3
; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a1, a0, 3
-; CHECK-RV32-FDV-NEXT: add a0, a1, a0
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a1, a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0, a0
-; CHECK-RV32-FDV-NEXT: addi a0, a0, -240
-; CHECK-RV32-FDV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
-; CHECK-RV32-FDV-NEXT: csrr a0, vlenb
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 2
-; CHECK-RV32-FDV-NEXT: mv a1, a0
-; CHECK-RV32-FDV-NEXT: slli a0, a0, 1
-; CHECK-RV32-FDV-NEXT: add a0, a0, a1
-; CHECK-RV32-FDV-NEXT: sub a0, s0,
a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 4 -; CHECK-RV32-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FDV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FDV-NEXT: csrr a0, vlenb ; CHECK-RV32-FDV-NEXT: slli a0, a0, 4 ; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 4 -; CHECK-RV32-FDV-NEXT: add a0, a1, a0 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: 
addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FDV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FDV-NEXT: csrr a0, vlenb ; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 ; CHECK-RV32-FDV-NEXT: mv a1, a0 @@ -7241,81 +3242,12 @@ define void @foo_fp_with_call() #2 { ; CHECK-RV32-FDV-NEXT: add a0, a0, a1 ; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: mv a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a1, a1, a0 -; CHECK-RV32-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV32-FDV-NEXT: add a0, a0, a1 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, 
a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-FDV-NEXT: csrr a0, vlenb -; CHECK-RV32-FDV-NEXT: slli a1, a0, 5 -; CHECK-RV32-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FDV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FDV-NEXT: csrr a0, vlenb ; CHECK-RV32-FDV-NEXT: slli a0, a0, 5 ; CHECK-RV32-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV32-FDV-NEXT: addi a0, a0, -240 -; CHECK-RV32-FDV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-FDV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-FDV-NEXT: addi sp, s0, -240 ; CHECK-RV32-FDV-NEXT: lw ra, 236(sp) # 4-byte Folded Reload ; CHECK-RV32-FDV-NEXT: lw t0, 232(sp) # 4-byte Folded Reload @@ -8186,422 +4118,39 @@ define void @foo_fp_with_call() #2 { ; CHECK-RV64-V-NEXT: slli a0, a0, 5 ; CHECK-RV64-V-NEXT: sub sp, sp, a0 ; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 3 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 3 ; CHECK-RV64-V-NEXT: sub a0, s0, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; 
CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 4 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-V-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 4 ; CHECK-RV64-V-NEXT: sub a0, s0, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-V-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 4 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 ; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, 
a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 ; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 ; CHECK-RV64-V-NEXT: slli a0, a0, 1 ; CHECK-RV64-V-NEXT: add a0, a0, a1 ; CHECK-RV64-V-NEXT: sub a0, s0, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; 
CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 5 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-V-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 5 ; CHECK-RV64-V-NEXT: sub a0, s0, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-V-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-V-NEXT: call otherfoo ; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 3 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 3 ; CHECK-RV64-V-NEXT: sub a0, s0, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; 
CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 4 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-V-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 4 ; CHECK-RV64-V-NEXT: sub a0, s0, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 4 -; CHECK-RV64-V-NEXT: add a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v20, (a0) # 
vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-V-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 3 ; CHECK-RV64-V-NEXT: mv a1, a0 @@ -8609,81 +4158,12 @@ define void @foo_fp_with_call() #2 { ; CHECK-RV64-V-NEXT: add a0, a0, a1 ; CHECK-RV64-V-NEXT: sub a0, s0, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 3 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 2 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: mv a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; 
CHECK-RV64-V-NEXT: add a1, a1, a0 -; CHECK-RV64-V-NEXT: slli a0, a0, 1 -; CHECK-RV64-V-NEXT: add a0, a0, a1 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-V-NEXT: csrr a0, vlenb -; CHECK-RV64-V-NEXT: slli a1, a0, 5 -; CHECK-RV64-V-NEXT: sub a0, a1, a0 -; CHECK-RV64-V-NEXT: sub a0, s0, a0 -; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-V-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-V-NEXT: csrr a0, vlenb ; CHECK-RV64-V-NEXT: slli a0, a0, 5 ; CHECK-RV64-V-NEXT: sub a0, s0, a0 ; CHECK-RV64-V-NEXT: addi a0, a0, -160 -; CHECK-RV64-V-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-V-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-V-NEXT: addi sp, s0, -160 ; CHECK-RV64-V-NEXT: ld ra, 152(sp) # 8-byte Folded Reload ; CHECK-RV64-V-NEXT: ld t0, 144(sp) # 8-byte Folded Reload @@ -8750,172 +4230,15 @@ define void @foo_fp_with_call() #2 { ; CHECK-RV64-FV-NEXT: slli a0, a0, 5 ; CHECK-RV64-FV-NEXT: sub sp, sp, a0 ; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v6, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, 
a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 4 ; CHECK-RV64-FV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill -; 
CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FV-NEXT: mv a1, a0 @@ -8923,331 +4246,36 @@ define void @foo_fp_with_call() #2 { ; CHECK-RV64-FV-NEXT: add a0, a0, a1 ; CHECK-RV64-FV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, 
a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 5 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 5 ; CHECK-RV64-FV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FV-NEXT: call otherfoo ; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: 
vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 4 ; CHECK-RV64-FV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FV-NEXT: add a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 ; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, 
a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 ; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 ; CHECK-RV64-FV-NEXT: slli a0, a0, 1 ; CHECK-RV64-FV-NEXT: add a0, a0, a1 ; CHECK-RV64-FV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: mv a1, a0 -; 
CHECK-RV64-FV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: mv a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a1, a1, a0 -; CHECK-RV64-FV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FV-NEXT: add a0, a0, a1 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FV-NEXT: csrr a0, vlenb -; CHECK-RV64-FV-NEXT: slli a1, a0, 5 -; CHECK-RV64-FV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FV-NEXT: csrr a0, vlenb ; CHECK-RV64-FV-NEXT: slli a0, a0, 5 ; CHECK-RV64-FV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FV-NEXT: addi a0, a0, -240 -; CHECK-RV64-FV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FV-NEXT: addi sp, s0, -240 ; CHECK-RV64-FV-NEXT: ld ra, 232(sp) # 8-byte Folded Reload ; CHECK-RV64-FV-NEXT: ld t0, 224(sp) # 8-byte Folded Reload @@ -9334,172 +4362,15 @@ define void @foo_fp_with_call() #2 { ; CHECK-RV64-FDV-NEXT: slli a0, a0, 5 ; CHECK-RV64-FDV-NEXT: sub sp, sp, a0 ; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v0, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v2, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v4, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v 
v6, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb ; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FDV-NEXT: add a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FDV-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FDV-NEXT: csrr a0, vlenb ; CHECK-RV64-FDV-NEXT: slli a0, a0, 4 ; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FDV-NEXT: add a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v17, 
(a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FDV-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FDV-NEXT: csrr a0, vlenb ; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FDV-NEXT: mv a1, a0 @@ -9507,249 +4378,23 @@ define void @foo_fp_with_call() #2 { ; CHECK-RV64-FDV-NEXT: add a0, a0, a1 ; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v25, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add 
a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v26, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v27, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v28, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v29, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 5 -; CHECK-RV64-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v30, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FDV-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FDV-NEXT: csrr a0, vlenb ; CHECK-RV64-FDV-NEXT: slli a0, a0, 5 ; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vs1r.v v31, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV64-FDV-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV64-FDV-NEXT: call otherfoo ; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v2, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v4, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: 
slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v6, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb ; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 3 -; CHECK-RV64-FDV-NEXT: add a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FDV-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FDV-NEXT: csrr a0, vlenb ; CHECK-RV64-FDV-NEXT: slli a0, a0, 4 ; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 4 -; CHECK-RV64-FDV-NEXT: add a0, a1, a0 -; 
CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FDV-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FDV-NEXT: csrr a0, vlenb ; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 ; CHECK-RV64-FDV-NEXT: mv a1, a0 @@ -9757,81 +4402,12 @@ define void @foo_fp_with_call() #2 { ; CHECK-RV64-FDV-NEXT: add a0, a0, a1 ; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 3 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; 
CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v25, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v26, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v27, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 2 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v28, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: mv a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a1, a1, a0 -; CHECK-RV64-FDV-NEXT: slli a0, a0, 1 -; CHECK-RV64-FDV-NEXT: add a0, a0, a1 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v29, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV64-FDV-NEXT: csrr a0, vlenb -; CHECK-RV64-FDV-NEXT: slli a1, a0, 5 -; CHECK-RV64-FDV-NEXT: sub a0, a1, a0 -; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 -; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v30, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FDV-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FDV-NEXT: csrr a0, vlenb ; CHECK-RV64-FDV-NEXT: slli a0, a0, 5 ; CHECK-RV64-FDV-NEXT: sub a0, s0, a0 ; CHECK-RV64-FDV-NEXT: addi a0, a0, -320 -; CHECK-RV64-FDV-NEXT: vl1r.v v31, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV64-FDV-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV64-FDV-NEXT: addi sp, s0, -320 ; CHECK-RV64-FDV-NEXT: ld ra, 312(sp) # 8-byte Folded Reload ; CHECK-RV64-FDV-NEXT: ld t0, 304(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll index b1a6d16..a06c750 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll @@ -7,18 +7,18 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 { ; RV32-LABEL: ctz_nxv4i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; RV32-NEXT: vid.v v10 -; RV32-NEXT: vmv.v.i v11, -1 ; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; RV32-NEXT: vid.v v10 +; RV32-NEXT: li a1, -1 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32-NEXT: vmsne.vi v0, v8, 0 ; RV32-NEXT: 
srli a0, a0, 1 ; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vmacc.vv v8, v10, v11 -; RV32-NEXT: vmv.v.i v9, 0 -; RV32-NEXT: vmerge.vvm v8, v9, v8, v0 +; RV32-NEXT: vmadd.vx v10, a1, v8 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: sub a0, a0, a1 @@ -28,21 +28,21 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 { ; ; RV64-LABEL: ctz_nxv4i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; RV64-NEXT: vid.v v10 -; RV64-NEXT: vmv.v.i v11, -1 ; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; RV64-NEXT: vid.v v10 +; RV64-NEXT: li a1, -1 ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64-NEXT: vmsne.vi v0, v8, 0 ; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vmacc.vv v8, v10, v11 -; RV64-NEXT: vmv.v.i v9, 0 -; RV64-NEXT: vmerge.vvm v8, v9, v8, v0 +; RV64-NEXT: vmadd.vx v10, a1, v8 +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a1, v8 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: slli a0, a0, 48 ; RV64-NEXT: srli a0, a0, 48 ; RV64-NEXT: ret @@ -109,17 +109,17 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) { ; ; RV64-LABEL: ctz_nxv8i1_no_range: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; RV64-NEXT: vid.v v16 -; RV64-NEXT: vmv.v.i v24, -1 ; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64-NEXT: vid.v v16 +; RV64-NEXT: li a1, -1 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: vmsne.vi v0, v8, 0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vmacc.vv v8, v16, v24 -; RV64-NEXT: vmv.v.i v16, 0 -; RV64-NEXT: vmerge.vvm v8, v16, v8, v0 +; RV64-NEXT: vmadd.vx v16, a1, v8 +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: sub a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll index 20dd590..1216d30 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll @@ -35,7 +35,7 @@ define i16 @ctz_v4i32(<4 x i32> %a) { ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: li a1, 4 -; RV64-NEXT: subw a1, a1, a0 +; RV64-NEXT: sub a1, a1, a0 ; RV64-NEXT: zext.b a0, a1 ; RV64-NEXT: ret %res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0) diff --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll index 1be599e4..7a1c41c 100644 --- a/llvm/test/CodeGen/RISCV/machine-combiner.ll +++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll @@ -454,7 +454,7 @@ define i32 @test_reassoc_add_sub_i32_1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK-LABEL: test_reassoc_add_sub_i32_1: ; CHECK: # %bb.0: ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: subw a2, a2, a3 +; CHECK-NEXT: sub a2, a2, a3 ; CHECK-NEXT: subw a0, a0, a2 ; CHECK-NEXT: ret %t0 = add i32 %a0, %a1 @@ -467,7 +467,7 @@ define i32 @test_reassoc_add_sub_i32_2(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK-LABEL: test_reassoc_add_sub_i32_2: ; CHECK: # %bb.0: ; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: subw a2, a2, a3 +; CHECK-NEXT: sub a2, a2, a3 ; CHECK-NEXT: addw a0, a0, a2 ; 
CHECK-NEXT: ret %t0 = add i32 %a0, %a1 diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll index 0d57e42..cd93579 100644 --- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll +++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll @@ -3780,9 +3780,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 @@ -3985,9 +3985,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll index 0caab1f..a5bdb13 100644 --- a/llvm/test/CodeGen/RISCV/memcmp.ll +++ b/llvm/test/CodeGen/RISCV/memcmp.ll @@ -4410,9 +4410,9 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_5: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lbu a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 @@ -4615,9 +4615,9 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind { ; ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_size_6: ; CHECK-UNALIGNED-RV64-ZBKB: # %bb.0: # %entry -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a2, 0(a0) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a2, 0(a0) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a0, 4(a0) -; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lwu a3, 0(a1) +; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lw a3, 0(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: lhu a1, 4(a1) ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a0, a2, a0 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT: pack a1, a3, a1 diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index 27d5eaa..4c9a98c 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1080,14 +1080,14 @@ define i32 @muli32_m65(i32 %a) nounwind { ; RV64I-LABEL: muli32_m65: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 6 -; RV64I-NEXT: negw a0, a0 +; RV64I-NEXT: neg a0, a0 ; RV64I-NEXT: subw a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: muli32_m65: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a0, 6 -; RV64IM-NEXT: negw a0, a0 +; RV64IM-NEXT: neg a0, a0 ; RV64IM-NEXT: subw a0, a0, a1 ; RV64IM-NEXT: ret %1 = mul i32 %a, -65 @@ -1980,14 +1980,14 @@ define i8 @muladd_demand(i8 %x, i8 %y) nounwind { ; RV64I-LABEL: muladd_demand: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 1 -; RV64I-NEXT: subw a0, a1, a0 +; 
RV64I-NEXT: sub a0, a1, a0 ; RV64I-NEXT: andi a0, a0, 15 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: muladd_demand: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 1 -; RV64IM-NEXT: subw a0, a1, a0 +; RV64IM-NEXT: sub a0, a1, a0 ; RV64IM-NEXT: andi a0, a0, 15 ; RV64IM-NEXT: ret %m = mul i8 %x, 14 @@ -2048,14 +2048,14 @@ define i8 @muladd_demand_2(i8 %x, i8 %y) nounwind { ; RV64I-LABEL: muladd_demand_2: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 1 -; RV64I-NEXT: subw a1, a1, a0 +; RV64I-NEXT: sub a1, a1, a0 ; RV64I-NEXT: ori a0, a1, -16 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: muladd_demand_2: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a0, a0, 1 -; RV64IM-NEXT: subw a1, a1, a0 +; RV64IM-NEXT: sub a1, a1, a0 ; RV64IM-NEXT: ori a0, a1, -16 ; RV64IM-NEXT: ret %m = mul i8 %x, 14 diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll index fe19a4fa..da81fe5 100644 --- a/llvm/test/CodeGen/RISCV/neg-abs.ll +++ b/llvm/test/CodeGen/RISCV/neg-abs.ll @@ -179,7 +179,7 @@ define i32 @neg_abs32_multiuse(i32 %x, ptr %y) { ; RV64I: # %bb.0: ; RV64I-NEXT: sraiw a2, a0, 31 ; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: subw a2, a0, a2 +; RV64I-NEXT: sub a2, a0, a2 ; RV64I-NEXT: negw a0, a2 ; RV64I-NEXT: sw a2, 0(a1) ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index 47b90a0..ba6769b 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -833,7 +833,7 @@ define i1 @usubo_ugt_i32(i32 %x, i32 %y, ptr %p) { ; RV64-NEXT: sext.w a3, a1 ; RV64-NEXT: sext.w a4, a0 ; RV64-NEXT: sltu a3, a4, a3 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: sw a0, 0(a2) ; RV64-NEXT: mv a0, a3 ; RV64-NEXT: ret @@ -860,7 +860,7 @@ define i1 @usubo_ugt_constant_op0_i8(i8 %x, ptr %p) { ; RV64: # %bb.0: ; RV64-NEXT: zext.b a2, a0 ; RV64-NEXT: li a3, 42 -; RV64-NEXT: subw a3, a3, a0 +; RV64-NEXT: sub a3, a3, a0 ; RV64-NEXT: sltiu a0, a2, 43 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: sb a3, 0(a1) @@ -890,7 +890,7 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) { ; RV64-NEXT: slli a2, a0, 48 ; RV64-NEXT: li a3, 43 ; RV64-NEXT: srli a2, a2, 48 -; RV64-NEXT: subw a3, a3, a0 +; RV64-NEXT: sub a3, a3, a0 ; RV64-NEXT: sltiu a0, a2, 44 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: sh a3, 0(a1) @@ -987,7 +987,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, ptr %p) { ; RV64-LABEL: usubo_ne_constant0_op1_i32: ; RV64: # %bb.0: ; RV64-NEXT: sext.w a2, a0 -; RV64-NEXT: negw a3, a0 +; RV64-NEXT: neg a3, a0 ; RV64-NEXT: snez a0, a2 ; RV64-NEXT: sw a3, 0(a1) ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/pr145360.ll b/llvm/test/CodeGen/RISCV/pr145360.ll index 4251ac6..1c77fad 100644 --- a/llvm/test/CodeGen/RISCV/pr145360.ll +++ b/llvm/test/CodeGen/RISCV/pr145360.ll @@ -8,7 +8,7 @@ define i32 @signed(i32 %0, ptr %1) { ; CHECK-NEXT: srliw a2, a2, 24 ; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: andi a2, a2, -256 -; CHECK-NEXT: subw a2, a0, a2 +; CHECK-NEXT: sub a2, a0, a2 ; CHECK-NEXT: sraiw a0, a0, 8 ; CHECK-NEXT: sw a2, 0(a1) ; CHECK-NEXT: ret @@ -29,7 +29,7 @@ define i32 @unsigned(i32 %0, ptr %1) { ; CHECK-NEXT: srli a2, a2, 36 ; CHECK-NEXT: slli a4, a2, 5 ; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: subw a2, a2, a4 +; CHECK-NEXT: sub a2, a2, a4 ; CHECK-NEXT: srliw a4, a0, 3 ; CHECK-NEXT: add a2, a0, a2 ; CHECK-NEXT: mulw a0, a4, a3 @@ -49,7 +49,7 @@ define i32 @signed_div_first(i32 %0, ptr %1) { ; CHECK-NEXT: add a3, a0, a2 ; CHECK-NEXT: sraiw a2, a3, 8 ; CHECK-NEXT: andi 
a3, a3, -256 -; CHECK-NEXT: subw a0, a0, a3 +; CHECK-NEXT: sub a0, a0, a3 ; CHECK-NEXT: sw a0, 0(a1) ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: ret @@ -70,7 +70,7 @@ define i32 @unsigned_div_first(i32 %0, ptr %1) { ; CHECK-NEXT: srli a2, a2, 36 ; CHECK-NEXT: slli a3, a2, 5 ; CHECK-NEXT: slli a4, a2, 3 -; CHECK-NEXT: subw a4, a4, a3 +; CHECK-NEXT: sub a4, a4, a3 ; CHECK-NEXT: add a0, a0, a4 ; CHECK-NEXT: sw a0, 0(a1) ; CHECK-NEXT: mv a0, a2 diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.mir b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir index e05e27a..b8ff783 100644 --- a/llvm/test/CodeGen/RISCV/prefer-w-inst.mir +++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir @@ -239,8 +239,8 @@ body: | ; NO-PREFER-W-INST-NEXT: {{ $}} ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 - ; NO-PREFER-W-INST-NEXT: [[LWU:%[0-9]+]]:gpr = LWU [[COPY]], 0 - ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LWU]], 1 + ; NO-PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1 ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] ; NO-PREFER-W-INST-NEXT: PseudoRET ; diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll index 634cca5..cf64650 100644 --- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll +++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll @@ -29,7 +29,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotl_32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -56,7 +56,7 @@ define i32 @rotl_32(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_32: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sllw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -78,7 +78,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotr_32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -105,7 +105,7 @@ define i32 @rotr_32(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_32: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srlw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -159,7 +159,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotl_64: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -253,7 +253,7 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sll a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -307,7 +307,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotr_64: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -401,7 +401,7 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_64: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 
+; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -423,7 +423,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotl_32_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sllw a1, a0, a1 ; RV64I-NEXT: srlw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -450,7 +450,7 @@ define i32 @rotl_32_mask(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_32_mask: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sllw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -474,7 +474,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64I-LABEL: rotl_32_mask_and_63_and_31: ; RV64I: # %bb.0: ; RV64I-NEXT: sllw a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -500,7 +500,7 @@ define i32 @rotl_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_32_mask_and_63_and_31: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sllw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -545,7 +545,7 @@ define i32 @rotl_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_32_mask_or_64_or_32: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sllw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -569,7 +569,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotr_32_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srlw a1, a0, a1 ; RV64I-NEXT: sllw a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -596,7 +596,7 @@ define i32 @rotr_32_mask(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_32_mask: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srlw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -620,7 +620,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64I-LABEL: rotr_32_mask_and_63_and_31: ; RV64I: # %bb.0: ; RV64I-NEXT: srlw a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sllw a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -646,7 +646,7 @@ define i32 @rotr_32_mask_and_63_and_31(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_32_mask_and_63_and_31: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srlw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -691,7 +691,7 @@ define i32 @rotr_32_mask_or_64_or_32(i32 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_32_mask_or_64_or_32: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srlw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sllw a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -745,7 +745,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotl_64_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; 
RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -835,7 +835,7 @@ define i64 @rotl_64_mask(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_mask: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sll a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -890,7 +890,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64I-LABEL: rotl_64_mask_and_127_and_63: ; RV64I: # %bb.0: ; RV64I-NEXT: sll a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -981,7 +981,7 @@ define i64 @rotl_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_mask_and_127_and_63: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sll a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1026,7 +1026,7 @@ define i64 @rotl_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_mask_or_128_or_64: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sll a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1080,7 +1080,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; ; RV64I-LABEL: rotr_64_mask: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -1170,7 +1170,7 @@ define i64 @rotr_64_mask(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_64_mask: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1225,7 +1225,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64I-LABEL: rotr_64_mask_and_127_and_63: ; RV64I: # %bb.0: ; RV64I-NEXT: srl a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -1316,7 +1316,7 @@ define i64 @rotr_64_mask_and_127_and_63(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_64_mask_and_127_and_63: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1361,7 +1361,7 @@ define i64 @rotr_64_mask_or_128_or_64(i64 %x, i64 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_64_mask_or_128_or_64: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -1390,7 +1390,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64I-LABEL: rotl_32_mask_shared: ; RV64I: # %bb.0: ; RV64I-NEXT: sllw a3, a0, a2 -; RV64I-NEXT: negw a4, a2 +; RV64I-NEXT: neg a4, a2 ; RV64I-NEXT: srlw a0, a0, a4 ; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: sllw a1, a1, a2 @@ -1424,7 +1424,7 @@ define signext i32 @rotl_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64XTHEADBB-LABEL: rotl_32_mask_shared: ; 
RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sllw a3, a0, a2 -; RV64XTHEADBB-NEXT: negw a4, a2 +; RV64XTHEADBB-NEXT: neg a4, a2 ; RV64XTHEADBB-NEXT: srlw a0, a0, a4 ; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: sllw a1, a1, a2 @@ -1486,7 +1486,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64I-LABEL: rotl_64_mask_shared: ; RV64I: # %bb.0: ; RV64I-NEXT: sll a3, a0, a2 -; RV64I-NEXT: negw a4, a2 +; RV64I-NEXT: neg a4, a2 ; RV64I-NEXT: srl a0, a0, a4 ; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: sll a1, a1, a2 @@ -1590,7 +1590,7 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64XTHEADBB-LABEL: rotl_64_mask_shared: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sll a3, a0, a2 -; RV64XTHEADBB-NEXT: negw a4, a2 +; RV64XTHEADBB-NEXT: neg a4, a2 ; RV64XTHEADBB-NEXT: srl a0, a0, a4 ; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: sll a1, a1, a2 @@ -1618,7 +1618,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64I-LABEL: rotr_32_mask_shared: ; RV64I: # %bb.0: ; RV64I-NEXT: srlw a3, a0, a2 -; RV64I-NEXT: negw a4, a2 +; RV64I-NEXT: neg a4, a2 ; RV64I-NEXT: sllw a0, a0, a4 ; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: sllw a1, a1, a2 @@ -1652,7 +1652,7 @@ define signext i32 @rotr_32_mask_shared(i32 signext %a, i32 signext %b, i32 sign ; RV64XTHEADBB-LABEL: rotr_32_mask_shared: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srlw a3, a0, a2 -; RV64XTHEADBB-NEXT: negw a4, a2 +; RV64XTHEADBB-NEXT: neg a4, a2 ; RV64XTHEADBB-NEXT: sllw a0, a0, a4 ; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: sllw a1, a1, a2 @@ -1713,7 +1713,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64I-LABEL: rotr_64_mask_shared: ; RV64I: # %bb.0: ; RV64I-NEXT: srl a3, a0, a2 -; RV64I-NEXT: negw a4, a2 +; RV64I-NEXT: neg a4, a2 ; RV64I-NEXT: sll a0, a0, a4 ; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: sll a1, a1, a2 @@ -1816,7 +1816,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign ; RV64XTHEADBB-LABEL: rotr_64_mask_shared: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a3, a0, a2 -; RV64XTHEADBB-NEXT: negw a4, a2 +; RV64XTHEADBB-NEXT: neg a4, a2 ; RV64XTHEADBB-NEXT: sll a0, a0, a4 ; RV64XTHEADBB-NEXT: or a0, a3, a0 ; RV64XTHEADBB-NEXT: sll a1, a1, a2 @@ -1846,7 +1846,7 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64I-LABEL: rotl_32_mask_multiple: ; RV64I: # %bb.0: ; RV64I-NEXT: sllw a3, a0, a2 -; RV64I-NEXT: negw a4, a2 +; RV64I-NEXT: neg a4, a2 ; RV64I-NEXT: sllw a2, a1, a2 ; RV64I-NEXT: srlw a0, a0, a4 ; RV64I-NEXT: srlw a1, a1, a4 @@ -1884,7 +1884,7 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64XTHEADBB-LABEL: rotl_32_mask_multiple: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sllw a3, a0, a2 -; RV64XTHEADBB-NEXT: negw a4, a2 +; RV64XTHEADBB-NEXT: neg a4, a2 ; RV64XTHEADBB-NEXT: sllw a2, a1, a2 ; RV64XTHEADBB-NEXT: srlw a0, a0, a4 ; RV64XTHEADBB-NEXT: srlw a1, a1, a4 @@ -1948,7 +1948,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64I-LABEL: rotl_64_mask_multiple: ; RV64I: # %bb.0: ; RV64I-NEXT: sll a3, a0, a2 -; RV64I-NEXT: negw a4, a2 +; RV64I-NEXT: neg a4, a2 ; RV64I-NEXT: sll a2, a1, a2 ; RV64I-NEXT: srl a0, a0, a4 ; RV64I-NEXT: srl a1, a1, a4 @@ -2056,7 +2056,7 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_mask_multiple: ; RV64XTHEADBB: # 
%bb.0: ; RV64XTHEADBB-NEXT: sll a3, a0, a2 -; RV64XTHEADBB-NEXT: negw a4, a2 +; RV64XTHEADBB-NEXT: neg a4, a2 ; RV64XTHEADBB-NEXT: sll a2, a1, a2 ; RV64XTHEADBB-NEXT: srl a0, a0, a4 ; RV64XTHEADBB-NEXT: srl a1, a1, a4 @@ -2087,7 +2087,7 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64I-LABEL: rotr_32_mask_multiple: ; RV64I: # %bb.0: ; RV64I-NEXT: srlw a3, a0, a2 -; RV64I-NEXT: negw a4, a2 +; RV64I-NEXT: neg a4, a2 ; RV64I-NEXT: srlw a2, a1, a2 ; RV64I-NEXT: sllw a0, a0, a4 ; RV64I-NEXT: sllw a1, a1, a4 @@ -2125,7 +2125,7 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si ; RV64XTHEADBB-LABEL: rotr_32_mask_multiple: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srlw a3, a0, a2 -; RV64XTHEADBB-NEXT: negw a4, a2 +; RV64XTHEADBB-NEXT: neg a4, a2 ; RV64XTHEADBB-NEXT: srlw a2, a1, a2 ; RV64XTHEADBB-NEXT: sllw a0, a0, a4 ; RV64XTHEADBB-NEXT: sllw a1, a1, a4 @@ -2188,7 +2188,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64I-LABEL: rotr_64_mask_multiple: ; RV64I: # %bb.0: ; RV64I-NEXT: srl a3, a0, a2 -; RV64I-NEXT: negw a4, a2 +; RV64I-NEXT: neg a4, a2 ; RV64I-NEXT: srl a2, a1, a2 ; RV64I-NEXT: sll a0, a0, a4 ; RV64I-NEXT: sll a1, a1, a4 @@ -2295,7 +2295,7 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind { ; RV64XTHEADBB-LABEL: rotr_64_mask_multiple: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a3, a0, a2 -; RV64XTHEADBB-NEXT: negw a4, a2 +; RV64XTHEADBB-NEXT: neg a4, a2 ; RV64XTHEADBB-NEXT: srl a2, a1, a2 ; RV64XTHEADBB-NEXT: sll a0, a0, a4 ; RV64XTHEADBB-NEXT: sll a1, a1, a4 @@ -2353,7 +2353,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotl_64_zext: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: sll a1, a0, a1 ; RV64I-NEXT: srl a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -2447,7 +2447,7 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotl_64_zext: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: sll a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: srl a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret @@ -2503,7 +2503,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; ; RV64I-LABEL: rotr_64_zext: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: srl a1, a0, a1 ; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: or a0, a1, a0 @@ -2597,7 +2597,7 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind { ; RV64XTHEADBB-LABEL: rotr_64_zext: ; RV64XTHEADBB: # %bb.0: ; RV64XTHEADBB-NEXT: srl a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: neg a1, a1 ; RV64XTHEADBB-NEXT: sll a0, a0, a1 ; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll index b8c4328..721436d 100644 --- a/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll +++ b/llvm/test/CodeGen/RISCV/rv64i-demanded-bits.ll @@ -121,7 +121,7 @@ define signext i32 @andi_sub_cse(i32 signext %0, i32 signext %1, ptr %2) { define signext i32 @addi_sub_cse(i32 signext %0, i32 signext %1, ptr %2) { ; CHECK-LABEL: addi_sub_cse: ; CHECK: # %bb.0: -; CHECK-NEXT: subw a0, a0, a1 +; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: addiw a0, a0, -8 ; CHECK-NEXT: sw a0, 0(a2) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll b/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll index 
dad20b2..6b4c253 100644 --- a/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll +++ b/llvm/test/CodeGen/RISCV/rv64i-exhaustive-w-insts.ll @@ -501,14 +501,14 @@ define signext i32 @sext_subw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind define zeroext i32 @zext_subw_aext_aext(i32 %a, i32 %b) nounwind { ; RV64I-LABEL: zext_subw_aext_aext: ; RV64I: # %bb.0: -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_subw_aext_aext: ; RV64ZBA: # %bb.0: -; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 ; RV64ZBA-NEXT: zext.w a0, a0 ; RV64ZBA-NEXT: ret %1 = sub i32 %a, %b @@ -518,14 +518,14 @@ define zeroext i32 @zext_subw_aext_aext(i32 %a, i32 %b) nounwind { define zeroext i32 @zext_subw_aext_sext(i32 %a, i32 signext %b) nounwind { ; RV64I-LABEL: zext_subw_aext_sext: ; RV64I: # %bb.0: -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_subw_aext_sext: ; RV64ZBA: # %bb.0: -; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 ; RV64ZBA-NEXT: zext.w a0, a0 ; RV64ZBA-NEXT: ret %1 = sub i32 %a, %b @@ -535,14 +535,14 @@ define zeroext i32 @zext_subw_aext_sext(i32 %a, i32 signext %b) nounwind { define zeroext i32 @zext_subw_aext_zext(i32 %a, i32 zeroext %b) nounwind { ; RV64I-LABEL: zext_subw_aext_zext: ; RV64I: # %bb.0: -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_subw_aext_zext: ; RV64ZBA: # %bb.0: -; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 ; RV64ZBA-NEXT: zext.w a0, a0 ; RV64ZBA-NEXT: ret %1 = sub i32 %a, %b @@ -552,14 +552,14 @@ define zeroext i32 @zext_subw_aext_zext(i32 %a, i32 zeroext %b) nounwind { define zeroext i32 @zext_subw_sext_aext(i32 signext %a, i32 %b) nounwind { ; RV64I-LABEL: zext_subw_sext_aext: ; RV64I: # %bb.0: -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_subw_sext_aext: ; RV64ZBA: # %bb.0: -; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 ; RV64ZBA-NEXT: zext.w a0, a0 ; RV64ZBA-NEXT: ret %1 = sub i32 %a, %b @@ -569,14 +569,14 @@ define zeroext i32 @zext_subw_sext_aext(i32 signext %a, i32 %b) nounwind { define zeroext i32 @zext_subw_sext_sext(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: zext_subw_sext_sext: ; RV64I: # %bb.0: -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_subw_sext_sext: ; RV64ZBA: # %bb.0: -; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 ; RV64ZBA-NEXT: zext.w a0, a0 ; RV64ZBA-NEXT: ret %1 = sub i32 %a, %b @@ -586,14 +586,14 @@ define zeroext i32 @zext_subw_sext_sext(i32 signext %a, i32 signext %b) nounwind define zeroext i32 @zext_subw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind { ; RV64I-LABEL: zext_subw_sext_zext: ; RV64I: # %bb.0: -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_subw_sext_zext: ; RV64ZBA: # %bb.0: -; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 ; RV64ZBA-NEXT: zext.w a0, a0 ; RV64ZBA-NEXT: ret %1 = sub i32 %a, %b @@ -603,14 +603,14 @@ define zeroext i32 
@zext_subw_sext_zext(i32 signext %a, i32 zeroext %b) nounwind define zeroext i32 @zext_subw_zext_aext(i32 zeroext %a, i32 %b) nounwind { ; RV64I-LABEL: zext_subw_zext_aext: ; RV64I: # %bb.0: -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_subw_zext_aext: ; RV64ZBA: # %bb.0: -; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 ; RV64ZBA-NEXT: zext.w a0, a0 ; RV64ZBA-NEXT: ret %1 = sub i32 %a, %b @@ -620,14 +620,14 @@ define zeroext i32 @zext_subw_zext_aext(i32 zeroext %a, i32 %b) nounwind { define zeroext i32 @zext_subw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind { ; RV64I-LABEL: zext_subw_zext_sext: ; RV64I: # %bb.0: -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_subw_zext_sext: ; RV64ZBA: # %bb.0: -; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 ; RV64ZBA-NEXT: zext.w a0, a0 ; RV64ZBA-NEXT: ret %1 = sub i32 %a, %b @@ -637,14 +637,14 @@ define zeroext i32 @zext_subw_zext_sext(i32 zeroext %a, i32 signext %b) nounwind define zeroext i32 @zext_subw_zext_zext(i32 zeroext %a, i32 zeroext %b) nounwind { ; RV64I-LABEL: zext_subw_zext_zext: ; RV64I: # %bb.0: -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_subw_zext_zext: ; RV64ZBA: # %bb.0: -; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 ; RV64ZBA-NEXT: zext.w a0, a0 ; RV64ZBA-NEXT: ret %1 = sub i32 %a, %b diff --git a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll index 0782018..219a5aa 100644 --- a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll +++ b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll @@ -9,7 +9,7 @@ define signext i32 @addw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin ; CHECK-NEXT: not a2, a0 ; CHECK-NEXT: addi a3, a0, 1 ; CHECK-NEXT: add a2, a2, a1 -; CHECK-NEXT: subw a1, a1, a0 +; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: addi a1, a1, -2 ; CHECK-NEXT: mul a3, a2, a3 ; CHECK-NEXT: slli a1, a1, 32 @@ -53,7 +53,7 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin ; CHECK-NEXT: bge a0, a1, .LBB1_2 ; CHECK-NEXT: # %bb.1: # %for.body.preheader ; CHECK-NEXT: not a2, a0 -; CHECK-NEXT: subw a3, a1, a0 +; CHECK-NEXT: sub a3, a1, a0 ; CHECK-NEXT: add a1, a2, a1 ; CHECK-NEXT: addi a3, a3, -2 ; CHECK-NEXT: mul a2, a1, a2 @@ -61,7 +61,7 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin ; CHECK-NEXT: slli a1, a1, 32 ; CHECK-NEXT: mulhu a1, a1, a3 ; CHECK-NEXT: srli a1, a1, 1 -; CHECK-NEXT: subw a0, a2, a0 +; CHECK-NEXT: sub a0, a2, a0 ; CHECK-NEXT: subw a0, a0, a1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_2: diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll index 00f7b46..81acb4f7 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll @@ -357,7 +357,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB6_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 6 ; RV64I-NEXT: slli a2, a0, 8 @@ -365,16 +365,16 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: 
slli a4, a0, 12 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: slli a2, a0, 16 -; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: sub a3, a3, a4 ; RV64I-NEXT: slli a4, a0, 18 -; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: sub a2, a2, a4 ; RV64I-NEXT: slli a4, a0, 4 -; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: sub a4, a0, a4 ; RV64I-NEXT: add a1, a4, a1 ; RV64I-NEXT: slli a4, a0, 14 -; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: sub a3, a3, a4 ; RV64I-NEXT: slli a4, a0, 23 -; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: sub a2, a2, a4 ; RV64I-NEXT: slli a0, a0, 27 ; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: add a0, a2, a0 @@ -410,7 +410,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-LABEL: cttz_zero_undef_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 6 ; RV64I-NEXT: slli a2, a0, 8 @@ -418,16 +418,16 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-NEXT: slli a4, a0, 12 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: slli a2, a0, 16 -; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: sub a3, a3, a4 ; RV64I-NEXT: slli a4, a0, 18 -; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: sub a2, a2, a4 ; RV64I-NEXT: slli a4, a0, 4 -; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: sub a4, a0, a4 ; RV64I-NEXT: add a1, a4, a1 ; RV64I-NEXT: slli a4, a0, 14 -; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: sub a3, a3, a4 ; RV64I-NEXT: slli a4, a0, 23 -; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: sub a2, a2, a4 ; RV64I-NEXT: slli a0, a0, 27 ; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: add a0, a2, a0 @@ -455,7 +455,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { ; RV64I-LABEL: findFirstSet_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a1, a0, a1 ; RV64I-NEXT: slli a2, a1, 6 ; RV64I-NEXT: slli a3, a1, 8 @@ -463,16 +463,16 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { ; RV64I-NEXT: slli a5, a1, 12 ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: slli a3, a1, 16 -; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: sub a4, a4, a5 ; RV64I-NEXT: slli a5, a1, 18 -; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: sub a3, a3, a5 ; RV64I-NEXT: slli a5, a1, 4 -; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: sub a5, a1, a5 ; RV64I-NEXT: add a2, a5, a2 ; RV64I-NEXT: slli a5, a1, 14 -; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: sub a4, a4, a5 ; RV64I-NEXT: slli a5, a1, 23 -; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: sub a3, a3, a5 ; RV64I-NEXT: slli a1, a1, 27 ; RV64I-NEXT: add a2, a2, a4 ; RV64I-NEXT: add a1, a3, a1 @@ -508,7 +508,7 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-LABEL: ffs_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a1, a0, a1 ; RV64I-NEXT: slli a2, a1, 6 ; RV64I-NEXT: slli a3, a1, 8 @@ -516,16 +516,16 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-NEXT: slli a5, a1, 12 ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: slli a3, a1, 16 -; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: sub a4, a4, a5 ; RV64I-NEXT: slli a5, a1, 18 -; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: sub a3, a3, a5 ; RV64I-NEXT: slli a5, a1, 4 -; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: sub a5, a1, a5 ; RV64I-NEXT: add a2, a5, a2 ; RV64I-NEXT: slli a5, a1, 14 -; RV64I-NEXT: subw a4, a4, a5 +; 
RV64I-NEXT: sub a4, a4, a5 ; RV64I-NEXT: slli a5, a1, 23 -; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: sub a3, a3, a5 ; RV64I-NEXT: add a2, a2, a4 ; RV64I-NEXT: lui a4, %hi(.LCPI9_0) ; RV64I-NEXT: addi a4, a4, %lo(.LCPI9_0) diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index fdff4a3..b46f7cc 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -3707,7 +3707,7 @@ define ptr @test_gep_gep_dont_crash(ptr %p, i64 %a1, i64 %a2) { define i64 @regression(i32 signext %x, i32 signext %y) { ; RV64I-LABEL: regression: ; RV64I: # %bb.0: -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a1, a0, 29 ; RV64I-NEXT: srli a0, a0, 27 @@ -3716,14 +3716,14 @@ define i64 @regression(i32 signext %x, i32 signext %y) { ; ; RV64ZBA-LABEL: regression: ; RV64ZBA: # %bb.0: -; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 ; RV64ZBA-NEXT: slli.uw a0, a0, 3 ; RV64ZBA-NEXT: sh1add a0, a0, a0 ; RV64ZBA-NEXT: ret ; ; RV64XANDESPERF-LABEL: regression: ; RV64XANDESPERF: # %bb.0: -; RV64XANDESPERF-NEXT: subw a0, a0, a1 +; RV64XANDESPERF-NEXT: sub a0, a0, a1 ; RV64XANDESPERF-NEXT: slli a0, a0, 32 ; RV64XANDESPERF-NEXT: srli a0, a0, 29 ; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a0 diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll index 12fc98c..f2c95f8 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll @@ -225,7 +225,7 @@ define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: rol_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: sllw a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -243,7 +243,7 @@ define void @rol_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind { ; RV64I-LABEL: rol_i32_nosext: ; RV64I: # %bb.0: ; RV64I-NEXT: sllw a3, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: sw a0, 0(a2) @@ -263,7 +263,7 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind { ; RV64I-LABEL: rol_i32_neg_constant_rhs: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, -2 -; RV64I-NEXT: negw a2, a0 +; RV64I-NEXT: neg a2, a0 ; RV64I-NEXT: sllw a0, a1, a0 ; RV64I-NEXT: srlw a1, a1, a2 ; RV64I-NEXT: or a0, a0, a1 @@ -284,7 +284,7 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: rol_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: sll a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -303,7 +303,7 @@ define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: ror_i32: ; RV64I: # %bb.0: ; RV64I-NEXT: srlw a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sllw a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret @@ -321,7 +321,7 @@ define void @ror_i32_nosext(i32 signext %a, i32 signext %b, ptr %x) nounwind { ; RV64I-LABEL: ror_i32_nosext: ; RV64I: # %bb.0: ; RV64I-NEXT: srlw a3, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sllw a0, a0, a1 ; RV64I-NEXT: or a0, a3, a0 ; RV64I-NEXT: sw a0, 0(a2) @@ -341,7 +341,7 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind { ; RV64I-LABEL: ror_i32_neg_constant_rhs: ; RV64I: # %bb.0: ; RV64I-NEXT: li a1, -2 -; RV64I-NEXT: negw a2, a0 +; RV64I-NEXT: neg a2, a0 
; RV64I-NEXT: srlw a0, a1, a0 ; RV64I-NEXT: sllw a1, a1, a2 ; RV64I-NEXT: or a0, a0, a1 @@ -362,7 +362,7 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: ror_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: srl a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index e640727..d133f9d 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -347,7 +347,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB6_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 6 ; RV64I-NEXT: slli a2, a0, 8 @@ -355,16 +355,16 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: slli a4, a0, 12 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: slli a2, a0, 16 -; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: sub a3, a3, a4 ; RV64I-NEXT: slli a4, a0, 18 -; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: sub a2, a2, a4 ; RV64I-NEXT: slli a4, a0, 4 -; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: sub a4, a0, a4 ; RV64I-NEXT: add a1, a4, a1 ; RV64I-NEXT: slli a4, a0, 14 -; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: sub a3, a3, a4 ; RV64I-NEXT: slli a4, a0, 23 -; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: sub a2, a2, a4 ; RV64I-NEXT: slli a0, a0, 27 ; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: add a0, a2, a0 @@ -390,7 +390,7 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind { define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-LABEL: cttz_zero_undef_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 6 ; RV64I-NEXT: slli a2, a0, 8 @@ -398,16 +398,16 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { ; RV64I-NEXT: slli a4, a0, 12 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: slli a2, a0, 16 -; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: sub a3, a3, a4 ; RV64I-NEXT: slli a4, a0, 18 -; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: sub a2, a2, a4 ; RV64I-NEXT: slli a4, a0, 4 -; RV64I-NEXT: subw a4, a0, a4 +; RV64I-NEXT: sub a4, a0, a4 ; RV64I-NEXT: add a1, a4, a1 ; RV64I-NEXT: slli a4, a0, 14 -; RV64I-NEXT: subw a3, a3, a4 +; RV64I-NEXT: sub a3, a3, a4 ; RV64I-NEXT: slli a4, a0, 23 -; RV64I-NEXT: subw a2, a2, a4 +; RV64I-NEXT: sub a2, a2, a4 ; RV64I-NEXT: slli a0, a0, 27 ; RV64I-NEXT: add a1, a1, a3 ; RV64I-NEXT: add a0, a2, a0 @@ -430,7 +430,7 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind { define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { ; RV64I-LABEL: findFirstSet_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a1, a0, a1 ; RV64I-NEXT: slli a2, a1, 6 ; RV64I-NEXT: slli a3, a1, 8 @@ -438,16 +438,16 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { ; RV64I-NEXT: slli a5, a1, 12 ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: slli a3, a1, 16 -; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: sub a4, a4, a5 ; RV64I-NEXT: slli a5, a1, 18 -; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: sub a3, a3, a5 ; RV64I-NEXT: slli a5, a1, 4 -; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: sub a5, a1, a5 ; RV64I-NEXT: add a2, a5, a2 ; RV64I-NEXT: slli a5, a1, 14 -; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: sub a4, a4, a5 ; RV64I-NEXT: slli a5, a1, 23 -; RV64I-NEXT: subw a3, a3, 
a5 +; RV64I-NEXT: sub a3, a3, a5 ; RV64I-NEXT: slli a1, a1, 27 ; RV64I-NEXT: add a2, a2, a4 ; RV64I-NEXT: add a1, a3, a1 @@ -478,7 +478,7 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind { define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-LABEL: ffs_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a1, a0, a1 ; RV64I-NEXT: slli a2, a1, 6 ; RV64I-NEXT: slli a3, a1, 8 @@ -486,16 +486,16 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind { ; RV64I-NEXT: slli a5, a1, 12 ; RV64I-NEXT: add a2, a2, a3 ; RV64I-NEXT: slli a3, a1, 16 -; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: sub a4, a4, a5 ; RV64I-NEXT: slli a5, a1, 18 -; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: sub a3, a3, a5 ; RV64I-NEXT: slli a5, a1, 4 -; RV64I-NEXT: subw a5, a1, a5 +; RV64I-NEXT: sub a5, a1, a5 ; RV64I-NEXT: add a2, a5, a2 ; RV64I-NEXT: slli a5, a1, 14 -; RV64I-NEXT: subw a4, a4, a5 +; RV64I-NEXT: sub a4, a4, a5 ; RV64I-NEXT: slli a5, a1, 23 -; RV64I-NEXT: subw a3, a3, a5 +; RV64I-NEXT: sub a3, a3, a5 ; RV64I-NEXT: add a2, a2, a4 ; RV64I-NEXT: lui a4, %hi(.LCPI9_0) ; RV64I-NEXT: addi a4, a4, %lo(.LCPI9_0) @@ -701,7 +701,7 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind { ; ; RV64ZBB-LABEL: ctpop_i32_load: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: lwu a0, 0(a0) +; RV64ZBB-NEXT: lw a0, 0(a0) ; RV64ZBB-NEXT: cpopw a0, a0 ; RV64ZBB-NEXT: ret %a = load i32, ptr %p @@ -1741,7 +1741,7 @@ define i8 @sub_if_uge_i8(i8 %x, i8 %y) { ; RV64ZBB-LABEL: sub_if_uge_i8: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: zext.b a2, a0 -; RV64ZBB-NEXT: subw a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a1 ; RV64ZBB-NEXT: zext.b a0, a0 ; RV64ZBB-NEXT: minu a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -1767,7 +1767,7 @@ define i16 @sub_if_uge_i16(i16 %x, i16 %y) { ; RV64ZBB-LABEL: sub_if_uge_i16: ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: zext.h a2, a0 -; RV64ZBB-NEXT: subw a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a1 ; RV64ZBB-NEXT: zext.h a0, a0 ; RV64ZBB-NEXT: minu a0, a2, a0 ; RV64ZBB-NEXT: ret @@ -1852,7 +1852,7 @@ define i32 @sub_if_uge_multiuse_select_i32(i32 %x, i32 %y) { ; CHECK-NEXT: sltu a2, a3, a2 ; CHECK-NEXT: addi a2, a2, -1 ; CHECK-NEXT: and a1, a2, a1 -; CHECK-NEXT: subw a0, a0, a1 +; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: sllw a0, a0, a1 ; CHECK-NEXT: ret %cmp = icmp ult i32 %x, %y @@ -1870,7 +1870,7 @@ define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) { ; RV64I-NEXT: sltu a4, a3, a2 ; RV64I-NEXT: addi a4, a4, -1 ; RV64I-NEXT: and a1, a4, a1 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: bltu a3, a2, .LBB68_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: li a1, 4 @@ -1980,7 +1980,7 @@ define i32 @sub_if_uge_C_i32(i32 signext %x) { ; RV64I-NEXT: lui a2, 1048560 ; RV64I-NEXT: addi a1, a1, -16 ; RV64I-NEXT: sltu a1, a1, a0 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: addi a2, a2, 15 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: addw a0, a0, a1 @@ -2036,7 +2036,7 @@ define i32 @sub_if_uge_C_multiuse_cmp_i32(i32 signext %x, ptr %z) { ; RV64I-NEXT: lui a3, 1048560 ; RV64I-NEXT: addi a2, a2, -16 ; RV64I-NEXT: sltu a2, a2, a0 -; RV64I-NEXT: negw a4, a2 +; RV64I-NEXT: neg a4, a2 ; RV64I-NEXT: addi a3, a3, 15 ; RV64I-NEXT: and a3, a4, a3 ; RV64I-NEXT: addw a0, a0, a3 diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll index 696c2a5..818ea72 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll @@ -114,7 +114,7 @@ define i64 @pack_i64_3(ptr %0, ptr %1) { ; RV64ZBKB-LABEL: pack_i64_3: 
; RV64ZBKB: # %bb.0: ; RV64ZBKB-NEXT: lw a0, 0(a0) -; RV64ZBKB-NEXT: lwu a1, 0(a1) +; RV64ZBKB-NEXT: lw a1, 0(a1) ; RV64ZBKB-NEXT: pack a0, a1, a0 ; RV64ZBKB-NEXT: ret %3 = load i32, ptr %0, align 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll b/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll index 96c349d..d166a6e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/callee-saved-regs.ll @@ -92,6 +92,150 @@ entry: ret <vscale x 1 x i32> %va } +define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee2(<vscale x 1 x i32> %va) nounwind { +; SPILL-O2-LABEL: test_vector_callee2: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 12 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: sub sp, sp, a0 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 11 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 10 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs1r.v v3, (a0) # vscale x 8-byte Folded Spill +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a1, a0, 3 +; SPILL-O2-NEXT: add a0, a1, a0 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs1r.v v5, (a0) # vscale x 8-byte Folded Spill +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 3 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs1r.v v7, (a0) # vscale x 8-byte Folded Spill +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 11 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 10 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl1r.v v3, (a0) # vscale x 8-byte Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a1, a0, 3 +; SPILL-O2-NEXT: add a0, a1, a0 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl1r.v v5, (a0) # vscale x 8-byte Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 3 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl1r.v v7, (a0) # vscale x 8-byte Folded Reload +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 12 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + call void asm sideeffect "", + "~{v1},~{v3},~{v5},~{v7},~{v24m2},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + + ret <vscale x 1 x i32> %va +} + +define riscv_vector_cc <vscale x 1 x i32> @test_vector_callee3(<vscale x 1 x i32> %va) nounwind { +; SPILL-O2-LABEL: test_vector_callee3: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 10 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: sub sp, sp, a0 +; SPILL-O2-NEXT: 
csrr a0, vlenb +; SPILL-O2-NEXT: slli a1, a0, 3 +; SPILL-O2-NEXT: add a0, a1, a0 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs1r.v v1, (a0) # vscale x 8-byte Folded Spill +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 3 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs1r.v v24, (a0) # vscale x 8-byte Folded Spill +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 6 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs2r.v v2, (a0) # vscale x 16-byte Folded Spill +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 2 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vs2r.v v26, (a0) # vscale x 16-byte Folded Spill +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: vs4r.v v28, (a0) # vscale x 32-byte Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a1, a0, 3 +; SPILL-O2-NEXT: add a0, a1, a0 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl1r.v v1, (a0) # vscale x 8-byte Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 3 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl1r.v v24, (a0) # vscale x 8-byte Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 6 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl2r.v v2, (a0) # vscale x 16-byte Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 2 +; SPILL-O2-NEXT: add a0, sp, a0 +; SPILL-O2-NEXT: addi a0, a0, 16 +; SPILL-O2-NEXT: vl2r.v v26, (a0) # vscale x 16-byte Folded Reload +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: vl4r.v v28, (a0) # vscale x 32-byte Folded Reload +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: li a1, 10 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + call void asm sideeffect "", + "~{v1},~{v2},~{v3},~{v24},~{v26m2},~{v28m2},~{v29},~{v30},~{v31}"() + + ret <vscale x 1 x i32> %va +} + ; Make sure the local stack allocation pass doesn't count vector registers. The ; sizes are chosen to be on the edge of what RISCVRegister::needsFrameBaseReg ; considers to need a virtual base register. 
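The test comment just above carries the one piece of reasoning in this file: when the local stack allocation pass decides whether a frame access deserves a virtual base register, only the fixed-size part of the frame should count, because scalable vector spill slots are sized in vlenb units and addressed through separate csrr/vlenb arithmetic. A minimal C++ sketch of the shape of that check, under the assumption that the decision ultimately reduces to the signed 12-bit immediate range of RISC-V loads/stores; the helper names below are illustrative, not LLVM's actual API:

    // Sketch only: does a frame-index access still reach its slot with an
    // I-type immediate? Scalable (vlenb-scaled) bytes are deliberately
    // excluded from the offset, mirroring what the test above guards.
    #include <cstdint>

    constexpr bool fitsInImm12(int64_t Offset) {
      return Offset >= -2048 && Offset <= 2047; // RISC-V I-type range
    }

    bool needsVirtualFrameBaseReg(int64_t FixedOffset, int64_t ScalableBytes) {
      (void)ScalableBytes; // not counted: addressed via csrr vlenb, not imm12
      return !fitsInImm12(FixedOffset);
    }

The test sizes sit right at the edge of that range, so mistakenly counting the vector registers would flip the decision.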
diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll index 5b82b27..81b2b65 100644 --- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll +++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll @@ -63,10 +63,10 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) { ; RV64-NEXT: and a2, t4, a2 ; RV64-NEXT: and t0, t3, t1 ; RV64-NEXT: and a7, t2, a7 -; RV64-NEXT: negw a7, a7 -; RV64-NEXT: negw t0, t0 -; RV64-NEXT: negw a2, a2 -; RV64-NEXT: negw a3, a3 +; RV64-NEXT: neg a7, a7 +; RV64-NEXT: neg t0, t0 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: neg a3, a3 ; RV64-NEXT: and a4, a7, a4 ; RV64-NEXT: and a6, t0, a6 ; RV64-NEXT: and a1, a2, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index 07aa05f..48845c5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -930,7 +930,7 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt ; CHECK-NEXT: add a2, a0, a4 ; CHECK-NEXT: slli a5, a4, 2 ; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: subw a3, a3, a4 +; CHECK-NEXT: sub a3, a3, a4 ; CHECK-NEXT: add a1, a1, a5 ; CHECK-NEXT: slli a3, a3, 32 ; CHECK-NEXT: srli a3, a3, 32 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll index b6253c6..dcf1ab0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll @@ -204,7 +204,7 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> % ; RV64-SLOW-NEXT: # %bb.1: # %cond.load ; RV64-SLOW-NEXT: vsetvli zero, zero, e64, m8, tu, ma ; RV64-SLOW-NEXT: vmv.x.s a1, v8 -; RV64-SLOW-NEXT: lwu a2, 4(a1) +; RV64-SLOW-NEXT: lw a2, 4(a1) ; RV64-SLOW-NEXT: lwu a1, 0(a1) ; RV64-SLOW-NEXT: slli a2, a2, 32 ; RV64-SLOW-NEXT: or a1, a2, a1 @@ -216,7 +216,7 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> % ; RV64-SLOW-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-SLOW-NEXT: vslidedown.vi v8, v8, 1 ; RV64-SLOW-NEXT: vmv.x.s a0, v8 -; RV64-SLOW-NEXT: lwu a1, 4(a0) +; RV64-SLOW-NEXT: lw a1, 4(a0) ; RV64-SLOW-NEXT: lwu a0, 0(a0) ; RV64-SLOW-NEXT: slli a1, a1, 32 ; RV64-SLOW-NEXT: or a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll index 1a716f6..e89bac5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -818,7 +818,7 @@ define <2 x i64> @vwaddu_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: lwu a0, 0(a1) +; RV64-NEXT: lw a0, 0(a1) ; RV64-NEXT: vwaddu.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll index 8ebd93e..b933ef9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -853,7 +853,7 @@ define <2 x i64> @vwmulsu_vx_v2i64_i32(ptr %x, ptr %y) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: lwu a0, 0(a1) +; RV64-NEXT: lw a0, 0(a1) ; RV64-NEXT: vwmulsu.vx v8, v9, a0 ; RV64-NEXT: ret %a 
= load <2 x i32>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll index 90e9ffd..7cedee5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll @@ -710,13 +710,6 @@ define <4 x i32> @vwmulu_vx_v4i32_i8(ptr %x, ptr %y) { } define <4 x i32> @vwmulu_vx_v4i32_i16(ptr %x, ptr %y) { -; CHECK-LABEL: vwmulu_vx_v4i32_i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: lhu a0, 0(a1) -; CHECK-NEXT: vwmulu.vx v8, v9, a0 -; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i16, ptr %y %c = zext i16 %b to i32 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll index bfdda47..86ac038e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -821,7 +821,7 @@ define <2 x i64> @vwsubu_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; ; RV64-LABEL: vwsubu_vx_v2i64_i32: ; RV64: # %bb.0: -; RV64-NEXT: lwu a1, 0(a1) +; RV64-NEXT: lw a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vle32.v v9, (a0) ; RV64-NEXT: vmv.v.x v10, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index f9ac53b..f481f9c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -274,10 +274,10 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-NOV-NEXT: sgtz a6, a2 ; CHECK-NOV-NEXT: sgtz a7, a3 ; CHECK-NOV-NEXT: sgtz t0, a5 -; CHECK-NOV-NEXT: negw t0, t0 -; CHECK-NOV-NEXT: negw a7, a7 -; CHECK-NOV-NEXT: negw a6, a6 -; CHECK-NOV-NEXT: negw a4, a4 +; CHECK-NOV-NEXT: neg t0, t0 +; CHECK-NOV-NEXT: neg a7, a7 +; CHECK-NOV-NEXT: neg a6, a6 +; CHECK-NOV-NEXT: neg a4, a4 ; CHECK-NOV-NEXT: and a5, t0, a5 ; CHECK-NOV-NEXT: and a3, a7, a3 ; CHECK-NOV-NEXT: and a2, a6, a2 @@ -755,10 +755,10 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-NOV-NEXT: sgtz a4, s1 ; CHECK-NOV-NEXT: sgtz a5, a1 ; CHECK-NOV-NEXT: sgtz a6, a3 -; CHECK-NOV-NEXT: negw a6, a6 -; CHECK-NOV-NEXT: negw a5, a5 -; CHECK-NOV-NEXT: negw a4, a4 -; CHECK-NOV-NEXT: negw a2, a2 +; CHECK-NOV-NEXT: neg a6, a6 +; CHECK-NOV-NEXT: neg a5, a5 +; CHECK-NOV-NEXT: neg a4, a4 +; CHECK-NOV-NEXT: neg a2, a2 ; CHECK-NOV-NEXT: and a3, a6, a3 ; CHECK-NOV-NEXT: and a1, a5, a1 ; CHECK-NOV-NEXT: and a4, a4, s1 @@ -1166,10 +1166,10 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-NOV-NEXT: sgtz a6, a2 ; CHECK-NOV-NEXT: sgtz a7, a3 ; CHECK-NOV-NEXT: sgtz t0, a5 -; CHECK-NOV-NEXT: negw t0, t0 -; CHECK-NOV-NEXT: negw a7, a7 -; CHECK-NOV-NEXT: negw a6, a6 -; CHECK-NOV-NEXT: negw a4, a4 +; CHECK-NOV-NEXT: neg t0, t0 +; CHECK-NOV-NEXT: neg a7, a7 +; CHECK-NOV-NEXT: neg a6, a6 +; CHECK-NOV-NEXT: neg a4, a4 ; CHECK-NOV-NEXT: and a5, t0, a5 ; CHECK-NOV-NEXT: and a3, a7, a3 ; CHECK-NOV-NEXT: and a2, a6, a2 @@ -2040,14 +2040,14 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NOV-NEXT: sgtz t4, a5 ; CHECK-NOV-NEXT: sgtz t5, a6 ; CHECK-NOV-NEXT: sgtz t6, a7 -; CHECK-NOV-NEXT: negw t6, t6 -; CHECK-NOV-NEXT: negw t5, t5 -; CHECK-NOV-NEXT: negw t4, t4 -; CHECK-NOV-NEXT: negw t3, t3 -; CHECK-NOV-NEXT: negw t2, t2 -; CHECK-NOV-NEXT: negw t1, t1 -; CHECK-NOV-NEXT: negw t0, t0 -; CHECK-NOV-NEXT: negw a4, a4 +; CHECK-NOV-NEXT: neg t6, t6 +; CHECK-NOV-NEXT: neg t5, t5 +; CHECK-NOV-NEXT: neg t4, t4 
+; CHECK-NOV-NEXT: neg t3, t3 +; CHECK-NOV-NEXT: neg t2, t2 +; CHECK-NOV-NEXT: neg t1, t1 +; CHECK-NOV-NEXT: neg t0, t0 +; CHECK-NOV-NEXT: neg a4, a4 ; CHECK-NOV-NEXT: and a7, t6, a7 ; CHECK-NOV-NEXT: and a6, t5, a6 ; CHECK-NOV-NEXT: and a5, t4, a5 @@ -3830,16 +3830,16 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: mv a5, a3 ; CHECK-NOV-NEXT: .LBB32_5: # %entry ; CHECK-NOV-NEXT: sgtz a3, a5 -; CHECK-NOV-NEXT: negw a3, a3 +; CHECK-NOV-NEXT: neg a3, a3 ; CHECK-NOV-NEXT: and a3, a3, a5 ; CHECK-NOV-NEXT: sgtz a5, a4 -; CHECK-NOV-NEXT: negw a5, a5 +; CHECK-NOV-NEXT: neg a5, a5 ; CHECK-NOV-NEXT: and a4, a5, a4 ; CHECK-NOV-NEXT: sgtz a5, a2 -; CHECK-NOV-NEXT: negw a5, a5 +; CHECK-NOV-NEXT: neg a5, a5 ; CHECK-NOV-NEXT: and a2, a5, a2 ; CHECK-NOV-NEXT: sgtz a5, a1 -; CHECK-NOV-NEXT: negw a5, a5 +; CHECK-NOV-NEXT: neg a5, a5 ; CHECK-NOV-NEXT: and a1, a5, a1 ; CHECK-NOV-NEXT: sw a3, 0(a0) ; CHECK-NOV-NEXT: sw a4, 4(a0) @@ -4306,16 +4306,16 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NOV-NEXT: mv a3, a2 ; CHECK-NOV-NEXT: .LBB35_5: # %entry ; CHECK-NOV-NEXT: sgtz a2, a3 -; CHECK-NOV-NEXT: negw a2, a2 +; CHECK-NOV-NEXT: neg a2, a2 ; CHECK-NOV-NEXT: and a2, a2, a3 ; CHECK-NOV-NEXT: sgtz a3, a1 -; CHECK-NOV-NEXT: negw a3, a3 +; CHECK-NOV-NEXT: neg a3, a3 ; CHECK-NOV-NEXT: and a1, a3, a1 ; CHECK-NOV-NEXT: sgtz a3, s1 -; CHECK-NOV-NEXT: negw a3, a3 +; CHECK-NOV-NEXT: neg a3, a3 ; CHECK-NOV-NEXT: and a3, a3, s1 ; CHECK-NOV-NEXT: sgtz a4, a0 -; CHECK-NOV-NEXT: negw a4, a4 +; CHECK-NOV-NEXT: neg a4, a4 ; CHECK-NOV-NEXT: and a0, a4, a0 ; CHECK-NOV-NEXT: sw a2, 0(s0) ; CHECK-NOV-NEXT: sw a1, 4(s0) @@ -4707,16 +4707,16 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-NOV-NEXT: mv a5, a3 ; CHECK-NOV-NEXT: .LBB41_5: # %entry ; CHECK-NOV-NEXT: sgtz a3, a5 -; CHECK-NOV-NEXT: negw a3, a3 +; CHECK-NOV-NEXT: neg a3, a3 ; CHECK-NOV-NEXT: and a3, a3, a5 ; CHECK-NOV-NEXT: sgtz a5, a4 -; CHECK-NOV-NEXT: negw a5, a5 +; CHECK-NOV-NEXT: neg a5, a5 ; CHECK-NOV-NEXT: and a4, a5, a4 ; CHECK-NOV-NEXT: sgtz a5, a2 -; CHECK-NOV-NEXT: negw a5, a5 +; CHECK-NOV-NEXT: neg a5, a5 ; CHECK-NOV-NEXT: and a2, a5, a2 ; CHECK-NOV-NEXT: sgtz a5, a1 -; CHECK-NOV-NEXT: negw a5, a5 +; CHECK-NOV-NEXT: neg a5, a5 ; CHECK-NOV-NEXT: and a1, a5, a1 ; CHECK-NOV-NEXT: sh a3, 0(a0) ; CHECK-NOV-NEXT: sh a4, 2(a0) @@ -5572,28 +5572,28 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: mv a7, a3 ; CHECK-NOV-NEXT: .LBB44_9: # %entry ; CHECK-NOV-NEXT: sgtz a3, a7 -; CHECK-NOV-NEXT: negw a3, a3 +; CHECK-NOV-NEXT: neg a3, a3 ; CHECK-NOV-NEXT: and a3, a3, a7 ; CHECK-NOV-NEXT: sgtz a7, a6 -; CHECK-NOV-NEXT: negw a7, a7 +; CHECK-NOV-NEXT: neg a7, a7 ; CHECK-NOV-NEXT: and a6, a7, a6 ; CHECK-NOV-NEXT: sgtz a7, a5 -; CHECK-NOV-NEXT: negw a7, a7 +; CHECK-NOV-NEXT: neg a7, a7 ; CHECK-NOV-NEXT: and a5, a7, a5 ; CHECK-NOV-NEXT: sgtz a7, a4 -; CHECK-NOV-NEXT: negw a7, a7 +; CHECK-NOV-NEXT: neg a7, a7 ; CHECK-NOV-NEXT: and a4, a7, a4 ; CHECK-NOV-NEXT: sgtz a7, a2 -; CHECK-NOV-NEXT: negw a7, a7 +; CHECK-NOV-NEXT: neg a7, a7 ; CHECK-NOV-NEXT: and a2, a7, a2 ; CHECK-NOV-NEXT: sgtz a7, a1 -; CHECK-NOV-NEXT: negw a7, a7 +; CHECK-NOV-NEXT: neg a7, a7 ; CHECK-NOV-NEXT: and a1, a7, a1 ; CHECK-NOV-NEXT: sgtz a7, s1 -; CHECK-NOV-NEXT: negw a7, a7 +; CHECK-NOV-NEXT: neg a7, a7 ; CHECK-NOV-NEXT: and a7, a7, s1 ; CHECK-NOV-NEXT: sgtz t0, a0 -; CHECK-NOV-NEXT: negw t0, t0 +; CHECK-NOV-NEXT: neg t0, t0 ; CHECK-NOV-NEXT: and a0, t0, a0 ; CHECK-NOV-NEXT: sh a2, 8(s0) ; CHECK-NOV-NEXT: sh a1, 10(s0) diff --git 
a/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll b/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll index af2e8d3..42c2556 100644 --- a/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll +++ b/llvm/test/CodeGen/RISCV/rvv/interrupt-attr-nocall.ll @@ -14,12 +14,8 @@ define void @foo_lmul1() nounwind #0 { ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 1 ; CHECK-RV32-NEXT: sub sp, sp, a0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill ; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, %hi(a) ; CHECK-RV32-NEXT: addi a0, a0, %lo(a) ; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -31,12 +27,8 @@ define void @foo_lmul1() nounwind #0 { ; CHECK-RV32-NEXT: lui a0, %hi(c) ; CHECK-RV32-NEXT: addi a0, a0, %lo(c) ; CHECK-RV32-NEXT: vse32.v v8, (a0) -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload ; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-NEXT: vl2r.v v8, (a0) # vscale x 16-byte Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 1 ; CHECK-RV32-NEXT: add sp, sp, a0 @@ -62,25 +54,8 @@ define void @foo_lmul2() nounwind #0 { ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 2 ; CHECK-RV32-NEXT: sub sp, sp, a0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: slli a1, a0, 1 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill ; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-NEXT: vs4r.v v8, (a0) # vscale x 32-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, %hi(d) ; CHECK-RV32-NEXT: addi a0, a0, %lo(d) ; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma @@ -92,25 +67,8 @@ define void @foo_lmul2() nounwind #0 { ; CHECK-RV32-NEXT: lui a0, %hi(f) ; CHECK-RV32-NEXT: addi a0, a0, %lo(f) ; CHECK-RV32-NEXT: vse32.v v8, (a0) -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: slli a1, a0, 1 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: 
vl1r.v v10, (a0) # vscale x 8-byte Folded Reload ; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-NEXT: vl4r.v v8, (a0) # vscale x 32-byte Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 2 ; CHECK-RV32-NEXT: add sp, sp, a0 @@ -136,56 +94,8 @@ define void @foo_lmul4() nounwind #0 { ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: sub sp, sp, a0 -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: slli a1, a0, 3 -; CHECK-RV32-NEXT: sub a0, a1, a0 -; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: slli a1, a0, 2 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: slli a1, a0, 1 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill ; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, %hi(g) ; CHECK-RV32-NEXT: addi a0, a0, %lo(g) ; CHECK-RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma @@ -197,50 +107,8 @@ define void @foo_lmul4() nounwind #0 { ; CHECK-RV32-NEXT: lui a0, %hi(i) ; CHECK-RV32-NEXT: addi a0, a0, %lo(i) ; CHECK-RV32-NEXT: vse32.v v8, (a0) -; CHECK-RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 3 -; CHECK-RV32-NEXT: sub a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: 
addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 2 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 1 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload ; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add sp, sp, a0 @@ -268,108 +136,12 @@ define void @foo_lmul8() nounwind #0 { ; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: sub sp, sp, a0 ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 4 -; CHECK-RV32-NEXT: sub a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a1, a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: add a1, a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v10, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a1, a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v12, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v13, (a0) # vscale x 8-byte Folded 
Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 3 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v14, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v15, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 3 -; CHECK-RV32-NEXT: sub a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v16, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v17, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 2 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v18, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v19, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 1 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v20, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v21, (a0) # vscale x 8-byte Folded Spill -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vs1r.v v22, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vs1r.v v23, (a0) # vscale x 8-byte Folded Spill +; CHECK-RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-RV32-NEXT: lui a0, %hi(j) ; CHECK-RV32-NEXT: addi a0, a0, %lo(j) ; CHECK-RV32-NEXT: li a1, 32 @@ -383,108 +155,12 @@ define void @foo_lmul8() nounwind #0 { ; CHECK-RV32-NEXT: addi a0, a0, %lo(l) ; CHECK-RV32-NEXT: vse32.v v8, (a0) ; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 4 -; CHECK-RV32-NEXT: sub a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a1, a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: add a1, a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: 
csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a1, a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v12, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v13, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 3 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v14, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 3 ; CHECK-RV32-NEXT: add a0, sp, a0 ; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v15, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 3 -; CHECK-RV32-NEXT: sub a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v16, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: mv a1, a0 -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, a0, a1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v17, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 2 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v18, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 2 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v19, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a1, a0, 1 -; CHECK-RV32-NEXT: add a0, a1, a0 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v20, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: slli a0, a0, 1 -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v21, (a0) # vscale x 8-byte Folded Reload -; CHECK-RV32-NEXT: csrr a0, vlenb -; CHECK-RV32-NEXT: add a0, sp, a0 -; CHECK-RV32-NEXT: addi a0, a0, 16 -; CHECK-RV32-NEXT: vl1r.v v22, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-NEXT: addi a0, sp, 16 -; CHECK-RV32-NEXT: vl1r.v v23, (a0) # vscale x 8-byte Folded Reload +; CHECK-RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-RV32-NEXT: csrr a0, vlenb ; CHECK-RV32-NEXT: slli a0, a0, 4 ; CHECK-RV32-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll b/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll index 4d9a6ae..749b2041 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/known-never-zero.ll
@@ -11,7 +11,7 @@ define i32 @vscale_known_nonzero() {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: negw a1, a0
+; CHECK-NEXT: neg a1, a0
 ; CHECK-NEXT: and a0, a0, a1
 ; CHECK-NEXT: slli a1, a0, 6
 ; CHECK-NEXT: slli a2, a0, 8
@@ -19,16 +19,16 @@ define i32 @vscale_known_nonzero() {
 ; CHECK-NEXT: slli a4, a0, 12
 ; CHECK-NEXT: add a1, a1, a2
 ; CHECK-NEXT: slli a2, a0, 16
-; CHECK-NEXT: subw a3, a3, a4
+; CHECK-NEXT: sub a3, a3, a4
 ; CHECK-NEXT: slli a4, a0, 18
-; CHECK-NEXT: subw a2, a2, a4
+; CHECK-NEXT: sub a2, a2, a4
 ; CHECK-NEXT: slli a4, a0, 4
-; CHECK-NEXT: subw a4, a0, a4
+; CHECK-NEXT: sub a4, a0, a4
 ; CHECK-NEXT: add a1, a4, a1
 ; CHECK-NEXT: slli a4, a0, 14
-; CHECK-NEXT: subw a3, a3, a4
+; CHECK-NEXT: sub a3, a3, a4
 ; CHECK-NEXT: slli a4, a0, 23
-; CHECK-NEXT: subw a2, a2, a4
+; CHECK-NEXT: sub a2, a2, a4
 ; CHECK-NEXT: slli a0, a0, 27
 ; CHECK-NEXT: add a1, a1, a3
 ; CHECK-NEXT: add a0, a2, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
index a050034..a7eaf39 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
@@ -78,12 +78,12 @@ body: |
     ; CHECK-NEXT: %false:vrnov0 = COPY $v9
     ; CHECK-NEXT: %mask:vmv0 = COPY $v0
     ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
-    ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 8, 5 /* e32 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 4, 5 /* e32 */, 0 /* tu, mu */
     %pt:vrnov0 = COPY $v8
     %false:vrnov0 = COPY $v9
     %mask:vmv0 = COPY $v0
-    %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
-    %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 8, 5 /* e32 */
+    %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 8, 5 /* e32 */, 0 /* tu, mu */
+    %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 4, 5 /* e32 */
 ...
 ---
 # Shouldn't be converted because false operands are different
@@ -163,3 +163,47 @@ body: |
     %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
   bb.1:
     %5:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */
+...
+---
+# Shouldn't be converted because vmerge adds back in elements from false past avl that would be lost if we converted to vmv.v.v
+name: preserve_false
+body: |
+  bb.0:
+    liveins: $v8, $v9, $v0, $x8, $x9
+    ; CHECK-LABEL: name: preserve_false
+    ; CHECK: liveins: $v8, $v9, $v0, $x8, $x9
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %pt:vrnov0 = COPY $v8
+    ; CHECK-NEXT: %false:vr = COPY $v9
+    ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+    ; CHECK-NEXT: %avl1:gprnox0 = COPY $x8
+    ; CHECK-NEXT: %avl2:gprnox0 = COPY $x9
+    ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
+    ; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
+    %pt:vrnov0 = COPY $v8
+    %false:vr = COPY $v9
+    %mask:vmv0 = COPY $v0
+    %avl1:gprnox0 = COPY $x8
+    %avl2:gprnox0 = COPY $x9
+    %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
+    %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
+...
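The preserve_false test above and its companion below pin down the legality rule for this peephole: folding PseudoVMERGE_VVM into PseudoVMV_V_V is only safe when the merge's AVL is known not to exceed the true operand's AVL, since any lane past the merge's AVL that vmerge would take from %false has no counterpart in vmv.v.v. A minimal C++ sketch of just the AVL half of that check, assuming both AVLs are compile-time constants; the in-tree peephole must also match masks, policies, and false/passthru operands, all elided here, and the names are illustrative:

    // Sketch: the AVL legality test for vmerge.vvm -> vmv.v.v.
    // An unknown (register) AVL is modeled as an empty optional.
    #include <cstdint>
    #include <optional>

    bool avlAllowsFoldToVMV(std::optional<int64_t> MergeAVL,
                            std::optional<int64_t> TrueAVL) {
      // Register AVLs, as in preserve_false above, force a conservative "no".
      if (!MergeAVL || !TrueAVL)
        return false;
      // As in preserve_false_avl_known_le below: merge AVL 1 <= true AVL 2,
      // so no false element past the merge's AVL can be lost by the fold.
      return *MergeAVL <= *TrueAVL;
    }

This is also why the first hunk of this file now expects the folded PseudoVMV_V_V_M1 with AVL 4 rather than 8: the fold keeps the smaller merge AVL.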
+---
+# But we can convert this one because vmerge's avl being <= true's means we don't lose any false elements past avl.
+name: preserve_false_avl_known_le
+body: |
+  bb.0:
+    liveins: $v8, $v9, $v0
+    ; CHECK-LABEL: name: preserve_false_avl_known_le
+    ; CHECK: liveins: $v8, $v9, $v0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %pt:vr = COPY $v8
+    ; CHECK-NEXT: %false:vrnov0 = COPY $v9
+    ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+    ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 1, 5 /* e32 */, 3 /* ta, ma */
+    ; CHECK-NEXT: [[PseudoVMV_V_V_M1_:%[0-9]+]]:vr = PseudoVMV_V_V_M1 %pt, %true, 1, 5 /* e32 */, 0 /* tu, mu */
+    %pt:vrnov0 = COPY $v8
+    %false:vr = COPY $v9
+    %mask:vmv0 = COPY $v0
+    %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 2, 5 /* e32 */, 3 /* ta, ma */
+    %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 1, 5 /* e32 */
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
index 3aeb4e8..9ffc84a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
@@ -71,10 +71,31 @@ define <vscale x 8 x i64> @vpmerge_m8(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y, <vscale x 8 x i1> %mask, i32 zeroext %evl)
   ret <vscale x 8 x i64> %1
 }
 
-declare <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1>, <vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1>, <vscale x 4 x i8>, <vscale x 4 x i8>, i32)
-declare <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1>, <vscale x 8 x i8>, <vscale x 8 x i8>, i32)
-declare <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 8 x i32> @llvm.vp.merge.nxv8i32(<vscale x 8 x i1>, <vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 8 x i64> @llvm.vp.merge.nxv8i64(<vscale x 8 x i1>, <vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+; Shouldn't be converted because vmerge adds back in elements from false past avl that would be lost if we converted to vmv.v.v
+define <vscale x 2 x i32> @preserve_false(ptr %p, <vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i1> %mask, i64 %avl1, i64 %avl2) {
+; CHECK-LABEL: preserve_false:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v9
+; CHECK-NEXT: vle32.v v10, (a0), v0.t
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, tu, ma
+; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0
+; CHECK-NEXT: ret
+  %true = call <vscale x 2 x i32> @llvm.riscv.vle.mask(<vscale x 2 x i32> %false, ptr %p, <vscale x 2 x i1> %mask, i64 %avl1, i64 3)
+  %res = call <vscale x 2 x i32> @llvm.riscv.vmerge(<vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i32> %true, <vscale x 2 x i1> %mask, i64 %avl2)
+  ret <vscale x 2 x i32> %res
+}
+
+; Can fold this because its avl is known to be <= true's, so no elements from false need to be introduced past avl.
+define <vscale x 2 x i32> @preserve_false_avl_known_le(ptr %p, <vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i1> %mask) { +; CHECK-LABEL: preserve_false_avl_known_le: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0), v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %true = call <vscale x 2 x i32> @llvm.riscv.vle.mask(<vscale x 2 x i32> %false, ptr %p, <vscale x 2 x i1> %mask, i64 2, i64 3) + %res = call <vscale x 2 x i32> @llvm.riscv.vmerge(<vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i32> %true, <vscale x 2 x i1> %mask, i64 1) + ret <vscale x 2 x i32> %res +} diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 8495dfe..32892bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,CHECK32,ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,CHECK64,ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,CHECK32,ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,CHECK64,ZVFHMIN declare <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat>, <vscale x 1 x bfloat>, metadata, <vscale x 1 x i1>, i32) @@ -4820,6 +4820,427 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f64(<vscale x 8 x double> %va, do declare <vscale x 32 x i1> @llvm.vp.fcmp.nxv32f64(<vscale x 32 x double>, <vscale x 32 x double>, metadata, <vscale x 32 x i1>, i32) define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vscale x 32 x double> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) { +; CHECK32-LABEL: fcmp_oeq_vv_nxv32f64: +; CHECK32: # %bb.0: +; CHECK32-NEXT: addi sp, sp, -48 +; CHECK32-NEXT: .cfi_def_cfa_offset 48 +; CHECK32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; CHECK32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; CHECK32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill +; CHECK32-NEXT: sw s2, 32(sp) # 4-byte Folded Spill +; CHECK32-NEXT: sw s3, 28(sp) # 4-byte Folded Spill +; CHECK32-NEXT: sw s4, 24(sp) # 4-byte Folded Spill +; CHECK32-NEXT: .cfi_offset ra, -4 +; CHECK32-NEXT: .cfi_offset s0, -8 +; CHECK32-NEXT: .cfi_offset s1, -12 +; CHECK32-NEXT: .cfi_offset s2, -16 +; CHECK32-NEXT: .cfi_offset s3, -20 +; CHECK32-NEXT: .cfi_offset s4, -24 +; CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: slli a1, a1, 1 +; CHECK32-NEXT: mv a3, a1 +; CHECK32-NEXT: slli a1, a1, 2 +; CHECK32-NEXT: add a3, a3, a1 +; CHECK32-NEXT: slli a1, a1, 1 +; CHECK32-NEXT: add a1, a1, a3 +; CHECK32-NEXT: sub sp, sp, a1 +; CHECK32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x1a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 26 * vlenb +; CHECK32-NEXT: mv s1, a6 +; 
CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: add a1, sp, a1 +; CHECK32-NEXT: addi a1, a1, 16 +; CHECK32-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill +; CHECK32-NEXT: mv s3, a2 +; CHECK32-NEXT: mv s2, a0 +; CHECK32-NEXT: csrr a0, vlenb +; CHECK32-NEXT: slli a1, a0, 3 +; CHECK32-NEXT: add a0, a1, a0 +; CHECK32-NEXT: add a0, sp, a0 +; CHECK32-NEXT: addi a0, a0, 16 +; CHECK32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK32-NEXT: csrr a0, vlenb +; CHECK32-NEXT: slli a0, a0, 1 +; CHECK32-NEXT: mv a1, a0 +; CHECK32-NEXT: slli a0, a0, 3 +; CHECK32-NEXT: add a0, a0, a1 +; CHECK32-NEXT: add a0, sp, a0 +; CHECK32-NEXT: addi a0, a0, 16 +; CHECK32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK32-NEXT: csrr s0, vlenb +; CHECK32-NEXT: li a1, 24 +; CHECK32-NEXT: mv a0, s0 +; CHECK32-NEXT: call __mulsi3 +; CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: add a1, sp, a1 +; CHECK32-NEXT: addi a1, a1, 16 +; CHECK32-NEXT: vl1r.v v6, (a1) # vscale x 8-byte Folded Reload +; CHECK32-NEXT: mv a1, a0 +; CHECK32-NEXT: slli a4, s0, 3 +; CHECK32-NEXT: srli s4, s0, 2 +; CHECK32-NEXT: srli a0, s0, 3 +; CHECK32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK32-NEXT: vslidedown.vx v7, v6, s4 +; CHECK32-NEXT: add a2, s3, a4 +; CHECK32-NEXT: vl8re64.v v16, (a2) +; CHECK32-NEXT: slli a6, s0, 4 +; CHECK32-NEXT: slli a2, s0, 1 +; CHECK32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; CHECK32-NEXT: vslidedown.vx v0, v6, a0 +; CHECK32-NEXT: mv a3, s1 +; CHECK32-NEXT: bltu s1, a2, .LBB257_2 +; CHECK32-NEXT: # %bb.1: +; CHECK32-NEXT: mv a3, a2 +; CHECK32-NEXT: .LBB257_2: +; CHECK32-NEXT: add a5, s3, a1 +; CHECK32-NEXT: add a1, s2, a4 +; CHECK32-NEXT: vslidedown.vx v9, v7, a0 +; CHECK32-NEXT: csrr a4, vlenb +; CHECK32-NEXT: slli a7, a4, 4 +; CHECK32-NEXT: add a4, a7, a4 +; CHECK32-NEXT: add a4, sp, a4 +; CHECK32-NEXT: addi a4, a4, 16 +; CHECK32-NEXT: vs1r.v v9, (a4) # vscale x 8-byte Folded Spill +; CHECK32-NEXT: add a4, s3, a6 +; CHECK32-NEXT: vl8re64.v v24, (s3) +; CHECK32-NEXT: sub a6, a3, s0 +; CHECK32-NEXT: sltu a7, a3, a6 +; CHECK32-NEXT: addi a7, a7, -1 +; CHECK32-NEXT: and a6, a7, a6 +; CHECK32-NEXT: csrr a7, vlenb +; CHECK32-NEXT: slli t0, a7, 3 +; CHECK32-NEXT: add a7, t0, a7 +; CHECK32-NEXT: add a7, sp, a7 +; CHECK32-NEXT: addi a7, a7, 16 +; CHECK32-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload +; CHECK32-NEXT: vsetvli zero, a6, e64, m8, ta, ma +; CHECK32-NEXT: vmfeq.vv v5, v8, v16, v0.t +; CHECK32-NEXT: bltu a3, s0, .LBB257_4 +; CHECK32-NEXT: # %bb.3: +; CHECK32-NEXT: mv a3, s0 +; CHECK32-NEXT: .LBB257_4: +; CHECK32-NEXT: vmv1r.v v0, v6 +; CHECK32-NEXT: vl8re64.v v8, (a5) +; CHECK32-NEXT: csrr a5, vlenb +; CHECK32-NEXT: slli a6, a5, 3 +; CHECK32-NEXT: add a5, a6, a5 +; CHECK32-NEXT: add a5, sp, a5 +; CHECK32-NEXT: addi a5, a5, 16 +; CHECK32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill +; CHECK32-NEXT: csrr a5, vlenb +; CHECK32-NEXT: slli a5, a5, 1 +; CHECK32-NEXT: mv a6, a5 +; CHECK32-NEXT: slli a5, a5, 3 +; CHECK32-NEXT: add a5, a5, a6 +; CHECK32-NEXT: add a5, sp, a5 +; CHECK32-NEXT: addi a5, a5, 16 +; CHECK32-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload +; CHECK32-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK32-NEXT: vmfeq.vv v8, v16, v24, v0.t +; CHECK32-NEXT: vl8re64.v v16, (a1) +; CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: add a1, sp, a1 +; CHECK32-NEXT: addi a1, a1, 16 +; CHECK32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; CHECK32-NEXT: vl8re64.v v16, (a4) +; CHECK32-NEXT: sub a1, s1, a2 +; CHECK32-NEXT: sltu a2, s1, a1 +; 
CHECK32-NEXT: vl8re64.v v24, (s2) +; CHECK32-NEXT: addi a2, a2, -1 +; CHECK32-NEXT: and s1, a2, a1 +; CHECK32-NEXT: vsetvli zero, s4, e8, mf2, tu, ma +; CHECK32-NEXT: vslideup.vx v8, v5, a0 +; CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: slli a1, a1, 1 +; CHECK32-NEXT: mv a2, a1 +; CHECK32-NEXT: slli a1, a1, 3 +; CHECK32-NEXT: add a1, a1, a2 +; CHECK32-NEXT: add a1, sp, a1 +; CHECK32-NEXT: addi a1, a1, 16 +; CHECK32-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill +; CHECK32-NEXT: mv a1, s1 +; CHECK32-NEXT: bltu s1, s0, .LBB257_6 +; CHECK32-NEXT: # %bb.5: +; CHECK32-NEXT: mv a1, s0 +; CHECK32-NEXT: .LBB257_6: +; CHECK32-NEXT: vmv1r.v v0, v7 +; CHECK32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK32-NEXT: vmfeq.vv v8, v24, v16, v0.t +; CHECK32-NEXT: addi a1, sp, 16 +; CHECK32-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill +; CHECK32-NEXT: li a1, 3 +; CHECK32-NEXT: call __mulsi3 +; CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: slli a2, a1, 4 +; CHECK32-NEXT: add a1, a2, a1 +; CHECK32-NEXT: add a1, sp, a1 +; CHECK32-NEXT: addi a1, a1, 16 +; CHECK32-NEXT: vl1r.v v0, (a1) # vscale x 8-byte Folded Reload +; CHECK32-NEXT: csrr a1, vlenb +; CHECK32-NEXT: slli a1, a1, 1 +; CHECK32-NEXT: mv a2, a1 +; CHECK32-NEXT: slli a1, a1, 3 +; CHECK32-NEXT: add a1, a1, a2 +; CHECK32-NEXT: add a1, sp, a1 +; CHECK32-NEXT: addi a1, a1, 16 +; CHECK32-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload +; CHECK32-NEXT: addi a1, sp, 16 +; CHECK32-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload +; CHECK32-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; CHECK32-NEXT: vslideup.vx v9, v8, s4 +; CHECK32-NEXT: sub a1, s1, s0 +; CHECK32-NEXT: sltu a2, s1, a1 +; CHECK32-NEXT: addi a2, a2, -1 +; CHECK32-NEXT: and a1, a2, a1 +; CHECK32-NEXT: csrr a2, vlenb +; CHECK32-NEXT: slli a3, a2, 3 +; CHECK32-NEXT: add a2, a3, a2 +; CHECK32-NEXT: add a2, sp, a2 +; CHECK32-NEXT: addi a2, a2, 16 +; CHECK32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; CHECK32-NEXT: csrr a2, vlenb +; CHECK32-NEXT: add a2, sp, a2 +; CHECK32-NEXT: addi a2, a2, 16 +; CHECK32-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; CHECK32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK32-NEXT: vmfeq.vv v8, v24, v16, v0.t +; CHECK32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK32-NEXT: vslideup.vx v9, v8, a0 +; CHECK32-NEXT: vmv1r.v v0, v9 +; CHECK32-NEXT: csrr a0, vlenb +; CHECK32-NEXT: slli a0, a0, 1 +; CHECK32-NEXT: mv a1, a0 +; CHECK32-NEXT: slli a0, a0, 2 +; CHECK32-NEXT: add a1, a1, a0 +; CHECK32-NEXT: slli a0, a0, 1 +; CHECK32-NEXT: add a0, a0, a1 +; CHECK32-NEXT: add sp, sp, a0 +; CHECK32-NEXT: .cfi_def_cfa sp, 48 +; CHECK32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; CHECK32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; CHECK32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload +; CHECK32-NEXT: lw s2, 32(sp) # 4-byte Folded Reload +; CHECK32-NEXT: lw s3, 28(sp) # 4-byte Folded Reload +; CHECK32-NEXT: lw s4, 24(sp) # 4-byte Folded Reload +; CHECK32-NEXT: .cfi_restore ra +; CHECK32-NEXT: .cfi_restore s0 +; CHECK32-NEXT: .cfi_restore s1 +; CHECK32-NEXT: .cfi_restore s2 +; CHECK32-NEXT: .cfi_restore s3 +; CHECK32-NEXT: .cfi_restore s4 +; CHECK32-NEXT: addi sp, sp, 48 +; CHECK32-NEXT: .cfi_def_cfa_offset 0 +; CHECK32-NEXT: ret +; +; CHECK64-LABEL: fcmp_oeq_vv_nxv32f64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: addi sp, sp, -64 +; CHECK64-NEXT: .cfi_def_cfa_offset 64 +; CHECK64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; CHECK64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; CHECK64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill +; 
CHECK64-NEXT: sd s2, 32(sp) # 8-byte Folded Spill +; CHECK64-NEXT: sd s3, 24(sp) # 8-byte Folded Spill +; CHECK64-NEXT: sd s4, 16(sp) # 8-byte Folded Spill +; CHECK64-NEXT: .cfi_offset ra, -8 +; CHECK64-NEXT: .cfi_offset s0, -16 +; CHECK64-NEXT: .cfi_offset s1, -24 +; CHECK64-NEXT: .cfi_offset s2, -32 +; CHECK64-NEXT: .cfi_offset s3, -40 +; CHECK64-NEXT: .cfi_offset s4, -48 +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: slli a1, a1, 1 +; CHECK64-NEXT: mv a3, a1 +; CHECK64-NEXT: slli a1, a1, 2 +; CHECK64-NEXT: add a3, a3, a1 +; CHECK64-NEXT: slli a1, a1, 1 +; CHECK64-NEXT: add a1, a1, a3 +; CHECK64-NEXT: sub sp, sp, a1 +; CHECK64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x1a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 26 * vlenb +; CHECK64-NEXT: mv s1, a6 +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill +; CHECK64-NEXT: mv s3, a2 +; CHECK64-NEXT: mv s2, a0 +; CHECK64-NEXT: csrr a0, vlenb +; CHECK64-NEXT: slli a1, a0, 3 +; CHECK64-NEXT: add a0, a1, a0 +; CHECK64-NEXT: add a0, sp, a0 +; CHECK64-NEXT: addi a0, a0, 16 +; CHECK64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK64-NEXT: csrr a0, vlenb +; CHECK64-NEXT: slli a0, a0, 1 +; CHECK64-NEXT: mv a1, a0 +; CHECK64-NEXT: slli a0, a0, 3 +; CHECK64-NEXT: add a0, a0, a1 +; CHECK64-NEXT: add a0, sp, a0 +; CHECK64-NEXT: addi a0, a0, 16 +; CHECK64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK64-NEXT: csrr s0, vlenb +; CHECK64-NEXT: li a1, 24 +; CHECK64-NEXT: mv a0, s0 +; CHECK64-NEXT: call __muldi3 +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vl1r.v v6, (a1) # vscale x 8-byte Folded Reload +; CHECK64-NEXT: mv a1, a0 +; CHECK64-NEXT: slli a4, s0, 3 +; CHECK64-NEXT: srli s4, s0, 2 +; CHECK64-NEXT: srli a0, s0, 3 +; CHECK64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK64-NEXT: vslidedown.vx v7, v6, s4 +; CHECK64-NEXT: add a2, s3, a4 +; CHECK64-NEXT: vl8re64.v v16, (a2) +; CHECK64-NEXT: slli a6, s0, 4 +; CHECK64-NEXT: slli a2, s0, 1 +; CHECK64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; CHECK64-NEXT: vslidedown.vx v0, v6, a0 +; CHECK64-NEXT: mv a3, s1 +; CHECK64-NEXT: bltu s1, a2, .LBB257_2 +; CHECK64-NEXT: # %bb.1: +; CHECK64-NEXT: mv a3, a2 +; CHECK64-NEXT: .LBB257_2: +; CHECK64-NEXT: add a5, s3, a1 +; CHECK64-NEXT: add a1, s2, a4 +; CHECK64-NEXT: vslidedown.vx v9, v7, a0 +; CHECK64-NEXT: csrr a4, vlenb +; CHECK64-NEXT: slli a7, a4, 4 +; CHECK64-NEXT: add a4, a7, a4 +; CHECK64-NEXT: add a4, sp, a4 +; CHECK64-NEXT: addi a4, a4, 16 +; CHECK64-NEXT: vs1r.v v9, (a4) # vscale x 8-byte Folded Spill +; CHECK64-NEXT: add a4, s3, a6 +; CHECK64-NEXT: vl8re64.v v24, (s3) +; CHECK64-NEXT: sub a6, a3, s0 +; CHECK64-NEXT: sltu a7, a3, a6 +; CHECK64-NEXT: addi a7, a7, -1 +; CHECK64-NEXT: and a6, a7, a6 +; CHECK64-NEXT: csrr a7, vlenb +; CHECK64-NEXT: slli t0, a7, 3 +; CHECK64-NEXT: add a7, t0, a7 +; CHECK64-NEXT: add a7, sp, a7 +; CHECK64-NEXT: addi a7, a7, 16 +; CHECK64-NEXT: vl8r.v v8, (a7) # vscale x 64-byte Folded Reload +; CHECK64-NEXT: vsetvli zero, a6, e64, m8, ta, ma +; CHECK64-NEXT: vmfeq.vv v5, v8, v16, v0.t +; CHECK64-NEXT: bltu a3, s0, .LBB257_4 +; CHECK64-NEXT: # %bb.3: +; CHECK64-NEXT: mv a3, s0 +; CHECK64-NEXT: .LBB257_4: +; CHECK64-NEXT: vmv1r.v v0, v6 +; CHECK64-NEXT: vl8re64.v v8, (a5) +; CHECK64-NEXT: csrr a5, vlenb +; CHECK64-NEXT: slli a6, a5, 3 +; CHECK64-NEXT: add a5, a6, a5 +; CHECK64-NEXT: 
add a5, sp, a5 +; CHECK64-NEXT: addi a5, a5, 16 +; CHECK64-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill +; CHECK64-NEXT: csrr a5, vlenb +; CHECK64-NEXT: slli a5, a5, 1 +; CHECK64-NEXT: mv a6, a5 +; CHECK64-NEXT: slli a5, a5, 3 +; CHECK64-NEXT: add a5, a5, a6 +; CHECK64-NEXT: add a5, sp, a5 +; CHECK64-NEXT: addi a5, a5, 16 +; CHECK64-NEXT: vl8r.v v16, (a5) # vscale x 64-byte Folded Reload +; CHECK64-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK64-NEXT: vmfeq.vv v8, v16, v24, v0.t +; CHECK64-NEXT: vl8re64.v v16, (a1) +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; CHECK64-NEXT: vl8re64.v v16, (a4) +; CHECK64-NEXT: sub a1, s1, a2 +; CHECK64-NEXT: sltu a2, s1, a1 +; CHECK64-NEXT: vl8re64.v v24, (s2) +; CHECK64-NEXT: addi a2, a2, -1 +; CHECK64-NEXT: and s1, a2, a1 +; CHECK64-NEXT: vsetvli zero, s4, e8, mf2, tu, ma +; CHECK64-NEXT: vslideup.vx v8, v5, a0 +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: slli a1, a1, 1 +; CHECK64-NEXT: mv a2, a1 +; CHECK64-NEXT: slli a1, a1, 3 +; CHECK64-NEXT: add a1, a1, a2 +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill +; CHECK64-NEXT: mv a1, s1 +; CHECK64-NEXT: bltu s1, s0, .LBB257_6 +; CHECK64-NEXT: # %bb.5: +; CHECK64-NEXT: mv a1, s0 +; CHECK64-NEXT: .LBB257_6: +; CHECK64-NEXT: vmv1r.v v0, v7 +; CHECK64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK64-NEXT: vmfeq.vv v8, v24, v16, v0.t +; CHECK64-NEXT: addi a1, sp, 16 +; CHECK64-NEXT: vs1r.v v8, (a1) # vscale x 8-byte Folded Spill +; CHECK64-NEXT: li a1, 3 +; CHECK64-NEXT: call __muldi3 +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: slli a2, a1, 4 +; CHECK64-NEXT: add a1, a2, a1 +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vl1r.v v0, (a1) # vscale x 8-byte Folded Reload +; CHECK64-NEXT: csrr a1, vlenb +; CHECK64-NEXT: slli a1, a1, 1 +; CHECK64-NEXT: mv a2, a1 +; CHECK64-NEXT: slli a1, a1, 3 +; CHECK64-NEXT: add a1, a1, a2 +; CHECK64-NEXT: add a1, sp, a1 +; CHECK64-NEXT: addi a1, a1, 16 +; CHECK64-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload +; CHECK64-NEXT: addi a1, sp, 16 +; CHECK64-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload +; CHECK64-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; CHECK64-NEXT: vslideup.vx v9, v8, s4 +; CHECK64-NEXT: sub a1, s1, s0 +; CHECK64-NEXT: sltu a2, s1, a1 +; CHECK64-NEXT: addi a2, a2, -1 +; CHECK64-NEXT: and a1, a2, a1 +; CHECK64-NEXT: csrr a2, vlenb +; CHECK64-NEXT: slli a3, a2, 3 +; CHECK64-NEXT: add a2, a3, a2 +; CHECK64-NEXT: add a2, sp, a2 +; CHECK64-NEXT: addi a2, a2, 16 +; CHECK64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; CHECK64-NEXT: csrr a2, vlenb +; CHECK64-NEXT: add a2, sp, a2 +; CHECK64-NEXT: addi a2, a2, 16 +; CHECK64-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; CHECK64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK64-NEXT: vmfeq.vv v8, v24, v16, v0.t +; CHECK64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK64-NEXT: vslideup.vx v9, v8, a0 +; CHECK64-NEXT: vmv1r.v v0, v9 +; CHECK64-NEXT: csrr a0, vlenb +; CHECK64-NEXT: slli a0, a0, 1 +; CHECK64-NEXT: mv a1, a0 +; CHECK64-NEXT: slli a0, a0, 2 +; CHECK64-NEXT: add a1, a1, a0 +; CHECK64-NEXT: slli a0, a0, 1 +; CHECK64-NEXT: add a0, a0, a1 +; CHECK64-NEXT: add sp, sp, a0 +; CHECK64-NEXT: .cfi_def_cfa sp, 64 +; CHECK64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; CHECK64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; 
CHECK64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload +; CHECK64-NEXT: ld s2, 32(sp) # 8-byte Folded Reload +; CHECK64-NEXT: ld s3, 24(sp) # 8-byte Folded Reload +; CHECK64-NEXT: ld s4, 16(sp) # 8-byte Folded Reload +; CHECK64-NEXT: .cfi_restore ra +; CHECK64-NEXT: .cfi_restore s0 +; CHECK64-NEXT: .cfi_restore s1 +; CHECK64-NEXT: .cfi_restore s2 +; CHECK64-NEXT: .cfi_restore s3 +; CHECK64-NEXT: .cfi_restore s4 +; CHECK64-NEXT: addi sp, sp, 64 +; CHECK64-NEXT: .cfi_def_cfa_offset 0 +; CHECK64-NEXT: ret %v = call <vscale x 32 x i1> @llvm.vp.fcmp.nxv32f64(<vscale x 32 x double> %va, <vscale x 32 x double> %vb, metadata !"oeq", <vscale x 32 x i1> %m, i32 %evl) ret <vscale x 32 x i1> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index c216fb6..346e40a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -549,7 +549,7 @@ define void @sink_splat_rsub_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB10_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: lw a3, 0(a2) -; CHECK-NEXT: subw a3, a1, a3 +; CHECK-NEXT: sub a3, a1, a3 ; CHECK-NEXT: sw a3, 0(a2) ; CHECK-NEXT: addi a2, a2, 4 ; CHECK-NEXT: bne a2, a0, .LBB10_6 diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll index 66e114c..f295bd8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll @@ -2300,7 +2300,7 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-RV64-NEXT: j .LBB98_5 ; CHECK-RV64-NEXT: .LBB98_2: # %vector.ph ; CHECK-RV64-NEXT: srli a3, a4, 1 -; CHECK-RV64-NEXT: negw a2, a3 +; CHECK-RV64-NEXT: neg a2, a3 ; CHECK-RV64-NEXT: andi a2, a2, 256 ; CHECK-RV64-NEXT: slli a4, a4, 1 ; CHECK-RV64-NEXT: mv a5, a0 @@ -2393,7 +2393,7 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-NOZBB64-NEXT: j .LBB98_5 ; CHECK-ZVKB-NOZBB64-NEXT: .LBB98_2: # %vector.ph ; CHECK-ZVKB-NOZBB64-NEXT: srli a3, a4, 1 -; CHECK-ZVKB-NOZBB64-NEXT: negw a2, a3 +; CHECK-ZVKB-NOZBB64-NEXT: neg a2, a3 ; CHECK-ZVKB-NOZBB64-NEXT: andi a2, a2, 256 ; CHECK-ZVKB-NOZBB64-NEXT: slli a4, a4, 1 ; CHECK-ZVKB-NOZBB64-NEXT: mv a5, a0 @@ -2485,7 +2485,7 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-ZBB64-NEXT: j .LBB98_5 ; CHECK-ZVKB-ZBB64-NEXT: .LBB98_2: # %vector.ph ; CHECK-ZVKB-ZBB64-NEXT: srli a3, a4, 1 -; CHECK-ZVKB-ZBB64-NEXT: negw a2, a3 +; CHECK-ZVKB-ZBB64-NEXT: neg a2, a3 ; CHECK-ZVKB-ZBB64-NEXT: andi a2, a2, 256 ; CHECK-ZVKB-ZBB64-NEXT: slli a4, a4, 1 ; CHECK-ZVKB-ZBB64-NEXT: mv a5, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll index 3740737..d0b184b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll @@ -50,9 +50,9 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) { ; RV64-NEXT: sgtz a5, a5 ; RV64-NEXT: sgtz a4, a4 ; RV64-NEXT: sgtz a3, a3 -; RV64-NEXT: negw a3, a3 -; RV64-NEXT: negw a4, a4 -; RV64-NEXT: negw a5, a5 +; RV64-NEXT: neg a3, a3 +; RV64-NEXT: neg a4, a4 +; RV64-NEXT: neg a5, a5 ; RV64-NEXT: and a3, a3, a6 ; RV64-NEXT: and a0, a4, a0 ; RV64-NEXT: and a2, a5, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 578b67e..f9f0aa6 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -542,95 +542,30 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) { ; CHECK-LABEL: masked_load_factor2: ; CHECK: # %bb.0: -; CHECK-NEXT: vl4r.v v12, (a0) -; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v12, 0 -; CHECK-NEXT: vnsrl.wi v10, v12, 8 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vlseg2e8.v v8, (a0) ; CHECK-NEXT: ret %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison) %deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec) ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %deinterleaved.results } -define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4(ptr %p) { -; CHECK-LABEL: masked_loat_factor4: +define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_load_factor4(ptr %p) { +; CHECK-LABEL: masked_load_factor4: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: vl4r.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs4r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison) %deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec) ret {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} %deinterleaved.results } -define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4_mask(ptr %p, <vscale x 8 x i1> %mask) { -; CHECK-LABEL: masked_loat_factor4_mask: +define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_load_factor4_mask(ptr %p, <vscale x 8 x i1> %mask) { +; CHECK-LABEL: masked_load_factor4_mask: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: add a3, a1, a2 -; CHECK-NEXT: vmv.v.v v9, v8 -; CHECK-NEXT: srli a4, a2, 2 -; CHECK-NEXT: vmv.v.v v10, v8 -; CHECK-NEXT: srli a5, a2, 3 -; CHECK-NEXT: vmv.v.v v11, v8 -; CHECK-NEXT: vsseg4e8.v v8, (a1) -; CHECK-NEXT: vl1r.v v8, (a1) -; CHECK-NEXT: add a1, a4, a5 -; CHECK-NEXT: vl1r.v v9, (a3) -; CHECK-NEXT: add a3, a3, a2 -; 
CHECK-NEXT: add a2, a3, a2 -; CHECK-NEXT: vl1r.v v10, (a3) -; CHECK-NEXT: vl1r.v v11, (a2) -; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmsne.vi v0, v8, 0 -; CHECK-NEXT: vmsne.vi v8, v10, 0 -; CHECK-NEXT: vmsne.vi v10, v11, 0 -; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vx v0, v9, a5 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vx v0, v8, a4 -; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v0, v10, a1 -; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0), v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs4r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask) %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison) @@ -640,8 +575,8 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i ; Negative test - some of the deinterleaved elements might come from the ; passthru not the load -define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4_passthru(ptr %p, <vscale x 8 x i1> %mask, <vscale x 32 x i8> %passthru) { -; CHECK-LABEL: masked_loat_factor4_passthru: +define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_load_factor4_passthru(ptr %p, <vscale x 8 x i1> %mask, <vscale x 32 x i8> %passthru) { +; CHECK-LABEL: masked_load_factor4_passthru: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index af55aaa..7e7d11e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -303,3 +303,26 @@ define void @vector_interleave_store_factor8(<vscale x 2 x i32> %a, <vscale x 2 store <vscale x 16 x i32> %v, ptr %p ret void } + +define void @masked_store_factor3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, ptr %p) { +; CHECK-LABEL: masked_store_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %v = call <vscale x 6 x i32> @llvm.vector.interleave3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c) + call void @llvm.masked.store(<vscale x 6 x i32> %v, ptr %p, i32 4, <vscale x 6 x i1> splat (i1 true)) + ret void +} + +define void @masked_store_factor3_masked(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, ptr %p, <vscale x 2 x i1> %m) { +; CHECK-LABEL: masked_store_factor3_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %interleaved.mask = call <vscale x 6 x i1> @llvm.vector.interleave3(<vscale x 2 x i1> %m, <vscale x 2 x i1> %m, <vscale x 2 x i1> %m) + %v = call <vscale x 6 x i32> 
@llvm.vector.interleave3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c) + call void @llvm.masked.store(<vscale x 6 x i32> %v, ptr %p, i32 4, <vscale x 6 x i1> %interleaved.mask) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll index 25a226e..eb129da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll @@ -959,7 +959,7 @@ define <vscale x 1 x i64> @vrol_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv1i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-RV64-NEXT: vsll.vx v9, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1022,7 +1022,7 @@ define <vscale x 2 x i64> @vrol_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv2i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; CHECK-RV64-NEXT: vsll.vx v10, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1085,7 +1085,7 @@ define <vscale x 4 x i64> @vrol_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv4i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; CHECK-RV64-NEXT: vsll.vx v12, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1148,7 +1148,7 @@ define <vscale x 8 x i64> @vrol_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vrol_vx_nxv8i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; CHECK-RV64-NEXT: vsll.vx v16, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll index 9e63b61..97524ac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll @@ -1626,7 +1626,7 @@ define <vscale x 1 x i64> @vror_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv1i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v9, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1728,7 +1728,7 @@ define <vscale x 2 x i64> @vror_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv2i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v10, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1830,7 +1830,7 @@ define <vscale x 4 x i64> @vror_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv4i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v12, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 @@ -1932,7 +1932,7 @@ define <vscale x 8 x i64> @vror_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) { ; CHECK-RV64-LABEL: vror_vx_nxv8i64: ; CHECK-RV64: # %bb.0: ; CHECK-RV64-NEXT: andi a1, a0, 63 -; 
CHECK-RV64-NEXT: negw a0, a0 +; CHECK-RV64-NEXT: neg a0, a0 ; CHECK-RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; CHECK-RV64-NEXT: vsrl.vx v16, v8, a1 ; CHECK-RV64-NEXT: andi a0, a0, 63 diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll index 8eef133..4442f97 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll @@ -77,7 +77,7 @@ define i64 @con1024_minus_rem() { ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: negw a0, a0 +; CHECK-NEXT: neg a0, a0 ; CHECK-NEXT: andi a0, a0, 1024 ; CHECK-NEXT: ret %vscale = call i64 @llvm.vscale.i64() diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll index 0ea80bf..2e1784d 100644 --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -647,7 +647,7 @@ define i32 @select_add_1(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV64IM-LABEL: select_add_1: ; RV64IM: # %bb.0: # %entry -; RV64IM-NEXT: negw a0, a0 +; RV64IM-NEXT: neg a0, a0 ; RV64IM-NEXT: and a0, a0, a1 ; RV64IM-NEXT: addw a0, a2, a0 ; RV64IM-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll index b128abb..b155fea 100644 --- a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -1048,21 +1048,21 @@ define signext i32 @bug(i32 signext %x) { ; CHECK-NEXT: srliw a2, a0, 24 ; CHECK-NEXT: seqz a2, a2 ; CHECK-NEXT: slli a3, a2, 3 -; CHECK-NEXT: negw a2, a2 +; CHECK-NEXT: neg a2, a2 ; CHECK-NEXT: sllw a0, a0, a3 ; CHECK-NEXT: andi a2, a2, -8 ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: srliw a2, a0, 28 ; CHECK-NEXT: seqz a2, a2 ; CHECK-NEXT: slli a3, a2, 2 -; CHECK-NEXT: negw a2, a2 +; CHECK-NEXT: neg a2, a2 ; CHECK-NEXT: sllw a0, a0, a3 ; CHECK-NEXT: andi a2, a2, -4 ; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: srliw a2, a0, 30 ; CHECK-NEXT: seqz a2, a2 ; CHECK-NEXT: slli a3, a2, 1 -; CHECK-NEXT: negw a2, a2 +; CHECK-NEXT: neg a2, a2 ; CHECK-NEXT: sllw a0, a0, a3 ; CHECK-NEXT: andi a2, a2, -2 ; CHECK-NEXT: add a1, a1, a2 @@ -1090,21 +1090,21 @@ define signext i32 @bug(i32 signext %x) { ; NOREMOVAL-NEXT: srliw a2, a0, 24 ; NOREMOVAL-NEXT: seqz a2, a2 ; NOREMOVAL-NEXT: slli a3, a2, 3 -; NOREMOVAL-NEXT: negw a2, a2 +; NOREMOVAL-NEXT: neg a2, a2 ; NOREMOVAL-NEXT: sllw a0, a0, a3 ; NOREMOVAL-NEXT: andi a2, a2, -8 ; NOREMOVAL-NEXT: add a1, a1, a2 ; NOREMOVAL-NEXT: srliw a2, a0, 28 ; NOREMOVAL-NEXT: seqz a2, a2 ; NOREMOVAL-NEXT: slli a3, a2, 2 -; NOREMOVAL-NEXT: negw a2, a2 +; NOREMOVAL-NEXT: neg a2, a2 ; NOREMOVAL-NEXT: sllw a0, a0, a3 ; NOREMOVAL-NEXT: andi a2, a2, -4 ; NOREMOVAL-NEXT: add a1, a1, a2 ; NOREMOVAL-NEXT: srliw a2, a0, 30 ; NOREMOVAL-NEXT: seqz a2, a2 ; NOREMOVAL-NEXT: slli a3, a2, 1 -; NOREMOVAL-NEXT: negw a2, a2 +; NOREMOVAL-NEXT: neg a2, a2 ; NOREMOVAL-NEXT: sllw a0, a0, a3 ; NOREMOVAL-NEXT: andi a2, a2, -2 ; NOREMOVAL-NEXT: add a1, a1, a2 diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll index 7ca1ee1..1ca23d7 100644 --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -383,7 +383,7 @@ define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind { ; RV64I-LABEL: fshr64_minsize: ; RV64I: # %bb.0: ; RV64I-NEXT: srl a2, a0, a1 -; RV64I-NEXT: negw a1, a1 +; RV64I-NEXT: neg a1, a1 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/shl-cttz.ll 
b/llvm/test/CodeGen/RISCV/shl-cttz.ll index 99dc4f8..e44d247 100644 --- a/llvm/test/CodeGen/RISCV/shl-cttz.ll +++ b/llvm/test/CodeGen/RISCV/shl-cttz.ll @@ -40,7 +40,7 @@ define i8 @shl_cttz_i8(i8 %x, i8 %y) { ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: srli a2, a1, 1 ; RV64I-NEXT: andi a2, a2, 85 -; RV64I-NEXT: subw a1, a1, a2 +; RV64I-NEXT: sub a1, a1, a2 ; RV64I-NEXT: andi a2, a1, 51 ; RV64I-NEXT: srli a1, a1, 2 ; RV64I-NEXT: andi a1, a1, 51 @@ -96,7 +96,7 @@ define i8 @shl_cttz_constant_i8(i8 %y) { ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: andi a1, a1, 85 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: andi a1, a0, 51 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: andi a0, a0, 51 @@ -276,7 +276,7 @@ define i32 @shl_cttz_i32(i32 %x, i32 %y) { ; ; RV64I-LABEL: shl_cttz_i32: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 30667 ; RV64I-NEXT: addi a2, a2, 1329 @@ -333,7 +333,7 @@ define i32 @shl_cttz_i32_zero_is_defined(i32 %x, i32 %y) { ; RV64I-NEXT: sext.w a2, a1 ; RV64I-NEXT: beqz a2, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 30667 ; RV64I-NEXT: addi a2, a2, 1329 @@ -378,7 +378,7 @@ define i32 @shl_cttz_constant_i32(i32 %y) { ; ; RV64I-LABEL: shl_cttz_constant_i32: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: negw a1, a0 +; RV64I-NEXT: neg a1, a0 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: lui a1, 30667 ; RV64I-NEXT: addi a1, a1, 1329 @@ -474,7 +474,7 @@ define i32 @shl_cttz_multiuse_i32(i32 %x, i32 %y) { ; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: .cfi_offset s0, -16 ; RV64I-NEXT: .cfi_offset s1, -24 -; RV64I-NEXT: negw a2, a1 +; RV64I-NEXT: neg a2, a1 ; RV64I-NEXT: and a1, a1, a2 ; RV64I-NEXT: lui a2, 30667 ; RV64I-NEXT: addi a2, a2, 1329 diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index 93fb230..bc23388 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -50,7 +50,7 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV64-NEXT: add a2, a2, a4 ; RV64-NEXT: slli a4, a0, 2 ; RV64-NEXT: add a4, a0, a4 -; RV64-NEXT: subw a1, a1, a4 +; RV64-NEXT: sub a1, a1, a4 ; RV64-NEXT: slli a4, a0, 17 ; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: slli a0, a0, 23 @@ -59,8 +59,8 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; RV64-NEXT: add a1, a1, a3 ; RV64-NEXT: lui a3, 1324 ; RV64-NEXT: addi a2, a2, -83 -; RV64-NEXT: subw a0, a0, a2 -; RV64-NEXT: subw a1, a1, a0 +; RV64-NEXT: sub a0, a0, a2 +; RV64-NEXT: sub a1, a1, a0 ; RV64-NEXT: slli a1, a1, 35 ; RV64-NEXT: srli a1, a1, 35 ; RV64-NEXT: addi a0, a3, -165 @@ -189,7 +189,7 @@ define i1 @test_srem_even(i4 %X) nounwind { ; RV64M-NEXT: add a1, a1, a2 ; RV64M-NEXT: slli a2, a1, 3 ; RV64M-NEXT: slli a1, a1, 1 -; RV64M-NEXT: subw a1, a1, a2 +; RV64M-NEXT: sub a1, a1, a2 ; RV64M-NEXT: add a0, a0, a1 ; RV64M-NEXT: andi a0, a0, 15 ; RV64M-NEXT: addi a0, a0, -1 @@ -225,7 +225,7 @@ define i1 @test_srem_even(i4 %X) nounwind { ; RV64MV-NEXT: add a1, a1, a2 ; RV64MV-NEXT: slli a2, a1, 3 ; RV64MV-NEXT: slli a1, a1, 1 -; RV64MV-NEXT: subw a1, a1, a2 +; RV64MV-NEXT: sub a1, a1, a2 ; RV64MV-NEXT: add a0, a0, a1 ; RV64MV-NEXT: andi a0, a0, 15 ; RV64MV-NEXT: addi a0, a0, -1 @@ -256,7 +256,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV64-NEXT: srli a1, a1, 62 ; RV64-NEXT: add a1, a0, 
a1 ; RV64-NEXT: andi a1, a1, 60 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: andi a0, a0, 63 ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: ret @@ -280,7 +280,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV64M-NEXT: srli a1, a1, 62 ; RV64M-NEXT: add a1, a0, a1 ; RV64M-NEXT: andi a1, a1, 60 -; RV64M-NEXT: subw a0, a0, a1 +; RV64M-NEXT: sub a0, a0, a1 ; RV64M-NEXT: andi a0, a0, 63 ; RV64M-NEXT: snez a0, a0 ; RV64M-NEXT: ret @@ -304,7 +304,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind { ; RV64MV-NEXT: srli a1, a1, 62 ; RV64MV-NEXT: add a1, a0, a1 ; RV64MV-NEXT: andi a1, a1, 60 -; RV64MV-NEXT: subw a0, a0, a1 +; RV64MV-NEXT: sub a0, a0, a1 ; RV64MV-NEXT: andi a0, a0, 63 ; RV64MV-NEXT: snez a0, a0 ; RV64MV-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index 30ffaf6..5129ccc 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -183,10 +183,10 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul a5, a5, t1 ; RV64IM-NEXT: li t1, -124 ; RV64IM-NEXT: mul a6, a6, t1 -; RV64IM-NEXT: subw a4, a4, a7 -; RV64IM-NEXT: subw a1, a1, t0 -; RV64IM-NEXT: subw a3, a3, a5 -; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: sub a4, a4, a7 +; RV64IM-NEXT: sub a1, a1, t0 +; RV64IM-NEXT: sub a3, a3, a5 +; RV64IM-NEXT: sub a2, a2, a6 ; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a4, 4(a0) @@ -357,10 +357,10 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul a7, a7, t1 ; RV64IM-NEXT: mul t0, t0, t1 ; RV64IM-NEXT: mul a2, a2, t1 -; RV64IM-NEXT: subw a3, a3, a6 -; RV64IM-NEXT: subw a4, a4, a7 -; RV64IM-NEXT: subw a5, a5, t0 -; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: sub a3, a3, a6 +; RV64IM-NEXT: sub a4, a4, a7 +; RV64IM-NEXT: sub a5, a5, t0 +; RV64IM-NEXT: sub a1, a1, a2 ; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: sh a4, 2(a0) ; RV64IM-NEXT: sh a5, 4(a0) @@ -597,10 +597,10 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV64IM-NEXT: add a1, a1, t1 ; RV64IM-NEXT: add a3, a3, t0 ; RV64IM-NEXT: add a4, a4, a7 -; RV64IM-NEXT: subw a2, a2, a6 -; RV64IM-NEXT: subw a1, a1, t4 -; RV64IM-NEXT: subw a3, a3, t3 -; RV64IM-NEXT: subw a4, a4, t2 +; RV64IM-NEXT: sub a2, a2, a6 +; RV64IM-NEXT: sub a1, a1, t4 +; RV64IM-NEXT: sub a3, a3, t3 +; RV64IM-NEXT: sub a4, a4, t2 ; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a1, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) @@ -703,15 +703,15 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; RV64I-NEXT: srli a1, a2, 58 ; RV64I-NEXT: add a1, a2, a1 ; RV64I-NEXT: andi a1, a1, -64 -; RV64I-NEXT: subw s1, a2, a1 +; RV64I-NEXT: sub s1, a2, a1 ; RV64I-NEXT: srli a1, a3, 59 ; RV64I-NEXT: add a1, a3, a1 ; RV64I-NEXT: andi a1, a1, -32 -; RV64I-NEXT: subw s2, a3, a1 +; RV64I-NEXT: sub s2, a3, a1 ; RV64I-NEXT: srli a1, a4, 61 ; RV64I-NEXT: add a1, a4, a1 ; RV64I-NEXT: andi a1, a1, -8 -; RV64I-NEXT: subw s3, a4, a1 +; RV64I-NEXT: sub s3, a4, a1 ; RV64I-NEXT: li a1, 95 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: sh s1, 0(s0) @@ -737,23 +737,23 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind { ; RV64IM-NEXT: srli a6, a2, 58 ; RV64IM-NEXT: add a6, a2, a6 ; RV64IM-NEXT: andi a6, a6, -64 -; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: sub a2, a2, a6 ; RV64IM-NEXT: srli a6, a3, 59 ; RV64IM-NEXT: add a6, a3, a6 ; RV64IM-NEXT: andi a6, a6, -32 -; RV64IM-NEXT: subw a3, a3, a6 +; RV64IM-NEXT: sub a3, a3, a6 ; RV64IM-NEXT: 
srli a6, a4, 61 ; RV64IM-NEXT: mulh a5, a1, a5 ; RV64IM-NEXT: add a6, a4, a6 ; RV64IM-NEXT: add a5, a5, a1 ; RV64IM-NEXT: andi a6, a6, -8 -; RV64IM-NEXT: subw a4, a4, a6 +; RV64IM-NEXT: sub a4, a4, a6 ; RV64IM-NEXT: srli a6, a5, 63 ; RV64IM-NEXT: srli a5, a5, 6 ; RV64IM-NEXT: add a5, a5, a6 ; RV64IM-NEXT: li a6, 95 ; RV64IM-NEXT: mul a5, a5, a6 -; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: sub a1, a1, a5 ; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a3, 2(a0) ; RV64IM-NEXT: sh a4, 4(a0) @@ -909,9 +909,9 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul a6, a6, a7 ; RV64IM-NEXT: li a7, 23 ; RV64IM-NEXT: mul a4, a4, a7 -; RV64IM-NEXT: subw a2, a2, a5 -; RV64IM-NEXT: subw a1, a1, a6 -; RV64IM-NEXT: subw a3, a3, a4 +; RV64IM-NEXT: sub a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a6 +; RV64IM-NEXT: sub a3, a3, a4 ; RV64IM-NEXT: sh zero, 0(a0) ; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) @@ -1011,7 +1011,7 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV64I-NEXT: add a1, a2, a1 ; RV64I-NEXT: lui a3, 8 ; RV64I-NEXT: and a1, a1, a3 -; RV64I-NEXT: subw s3, a2, a1 +; RV64I-NEXT: sub s3, a2, a1 ; RV64I-NEXT: li a1, 23 ; RV64I-NEXT: call __moddi3 ; RV64I-NEXT: mv s2, a0 @@ -1050,7 +1050,7 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV64IM-NEXT: add a5, a5, a7 ; RV64IM-NEXT: mulh a4, a3, a4 ; RV64IM-NEXT: add a4, a4, a3 -; RV64IM-NEXT: subw a2, a2, a6 +; RV64IM-NEXT: sub a2, a2, a6 ; RV64IM-NEXT: srli a6, a4, 63 ; RV64IM-NEXT: srli a4, a4, 4 ; RV64IM-NEXT: add a4, a4, a6 @@ -1059,8 +1059,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul a5, a5, a6 ; RV64IM-NEXT: li a6, 23 ; RV64IM-NEXT: mul a4, a4, a6 -; RV64IM-NEXT: subw a1, a1, a5 -; RV64IM-NEXT: subw a3, a3, a4 +; RV64IM-NEXT: sub a1, a1, a5 +; RV64IM-NEXT: sub a3, a3, a4 ; RV64IM-NEXT: sh zero, 0(a0) ; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) diff --git a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll index 3007c35..0c13a1d 100644 --- a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll +++ b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll @@ -26,7 +26,7 @@ define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) { define zeroext i16 @overflow_sub(i16 zeroext %a, i16 zeroext %b) { ; CHECK-LABEL: overflow_sub: ; CHECK: # %bb.0: -; CHECK-NEXT: subw a0, a0, a1 +; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: ori a0, a0, 1 ; CHECK-NEXT: slli a0, a0, 48 ; CHECK-NEXT: srli a0, a0, 48 diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll index af5121d..ee49612 100644 --- a/llvm/test/CodeGen/RISCV/urem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -48,7 +48,7 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind { ; RV64IM-NEXT: slli a2, a2, 32 ; RV64IM-NEXT: mulhu a1, a1, a2 ; RV64IM-NEXT: srli a1, a1, 32 -; RV64IM-NEXT: subw a2, a0, a1 +; RV64IM-NEXT: sub a2, a0, a1 ; RV64IM-NEXT: srliw a2, a2, 1 ; RV64IM-NEXT: add a1, a2, a1 ; RV64IM-NEXT: srli a1, a1, 6 @@ -174,7 +174,7 @@ define i32 @combine_urem_udiv(i32 %x) nounwind { ; RV64IM-NEXT: slli a2, a2, 32 ; RV64IM-NEXT: mulhu a1, a1, a2 ; RV64IM-NEXT: srli a1, a1, 32 -; RV64IM-NEXT: subw a2, a0, a1 +; RV64IM-NEXT: sub a2, a0, a1 ; RV64IM-NEXT: srliw a2, a2, 1 ; RV64IM-NEXT: add a1, a2, a1 ; RV64IM-NEXT: li a2, 95 diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll index d33c666..636fdfa 100644 --- 
a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -31,11 +31,11 @@ define i1 @test_urem_odd(i13 %X) nounwind { ; RV64-NEXT: slli a1, a0, 4 ; RV64-NEXT: slli a2, a0, 6 ; RV64-NEXT: slli a3, a0, 8 -; RV64-NEXT: subw a1, a1, a2 +; RV64-NEXT: sub a1, a1, a2 ; RV64-NEXT: slli a2, a0, 10 -; RV64-NEXT: subw a3, a3, a2 +; RV64-NEXT: sub a3, a3, a2 ; RV64-NEXT: slli a2, a0, 2 -; RV64-NEXT: subw a2, a0, a2 +; RV64-NEXT: sub a2, a0, a2 ; RV64-NEXT: slli a0, a0, 12 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a0, a3, a0 @@ -138,10 +138,10 @@ define i1 @test_urem_even(i27 %X) nounwind { ; RV64-NEXT: slli a4, a0, 18 ; RV64-NEXT: add a3, a3, a4 ; RV64-NEXT: slli a0, a0, 27 -; RV64-NEXT: subw a0, a0, a2 +; RV64-NEXT: sub a0, a0, a2 ; RV64-NEXT: lui a2, 2341 ; RV64-NEXT: add a1, a1, a3 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: slli a1, a0, 26 ; RV64-NEXT: slli a0, a0, 37 ; RV64-NEXT: srli a0, a0, 38 @@ -234,8 +234,8 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; RV64-LABEL: test_urem_odd_setne: ; RV64: # %bb.0: ; RV64-NEXT: slli a1, a0, 1 -; RV64-NEXT: negw a0, a0 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: andi a0, a0, 15 ; RV64-NEXT: sltiu a0, a0, 4 ; RV64-NEXT: xori a0, a0, 1 @@ -254,8 +254,8 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; RV64M-LABEL: test_urem_odd_setne: ; RV64M: # %bb.0: ; RV64M-NEXT: slli a1, a0, 1 -; RV64M-NEXT: negw a0, a0 -; RV64M-NEXT: subw a0, a0, a1 +; RV64M-NEXT: neg a0, a0 +; RV64M-NEXT: sub a0, a0, a1 ; RV64M-NEXT: andi a0, a0, 15 ; RV64M-NEXT: sltiu a0, a0, 4 ; RV64M-NEXT: xori a0, a0, 1 @@ -274,8 +274,8 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; RV64MV-LABEL: test_urem_odd_setne: ; RV64MV: # %bb.0: ; RV64MV-NEXT: slli a1, a0, 1 -; RV64MV-NEXT: negw a0, a0 -; RV64MV-NEXT: subw a0, a0, a1 +; RV64MV-NEXT: neg a0, a0 +; RV64MV-NEXT: sub a0, a0, a1 ; RV64MV-NEXT: andi a0, a0, 15 ; RV64MV-NEXT: sltiu a0, a0, 4 ; RV64MV-NEXT: xori a0, a0, 1 @@ -306,9 +306,9 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind { ; RV64-NEXT: slli a1, a0, 2 ; RV64-NEXT: slli a2, a0, 4 ; RV64-NEXT: slli a3, a0, 6 -; RV64-NEXT: subw a1, a1, a0 -; RV64-NEXT: subw a2, a2, a3 -; RV64-NEXT: subw a1, a1, a2 +; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: sub a2, a2, a3 +; RV64-NEXT: sub a1, a1, a2 ; RV64-NEXT: slli a0, a0, 8 ; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: andi a0, a0, 511 @@ -437,7 +437,7 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64-NEXT: addi a2, a2, -2 ; RV64-NEXT: add a1, a1, a4 ; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: subw a4, t0, a7 +; RV64-NEXT: sub a4, t0, a7 ; RV64-NEXT: slli a6, a3, 3 ; RV64-NEXT: slli a7, a3, 6 ; RV64-NEXT: slli t0, a3, 9 @@ -447,18 +447,18 @@ define void @test_urem_vec(ptr %X) nounwind { ; RV64-NEXT: slli a6, a2, 4 ; RV64-NEXT: add a7, a7, t0 ; RV64-NEXT: slli t0, a2, 6 -; RV64-NEXT: subw a6, a6, t0 +; RV64-NEXT: sub a6, a6, t0 ; RV64-NEXT: slli t0, a2, 8 -; RV64-NEXT: subw a5, a5, a2 +; RV64-NEXT: sub a5, a5, a2 ; RV64-NEXT: slli a2, a2, 10 -; RV64-NEXT: subw a2, t0, a2 -; RV64-NEXT: subw a4, a4, a1 +; RV64-NEXT: sub a2, t0, a2 +; RV64-NEXT: sub a4, a4, a1 ; RV64-NEXT: add a3, a3, a7 -; RV64-NEXT: subw a1, a5, a6 +; RV64-NEXT: sub a1, a5, a6 ; RV64-NEXT: slli a5, a4, 10 ; RV64-NEXT: slli a4, a4, 53 -; RV64-NEXT: negw a3, a3 -; RV64-NEXT: subw a1, a1, a2 +; RV64-NEXT: neg a3, a3 +; RV64-NEXT: sub a1, a1, a2 ; RV64-NEXT: srli a4, a4, 54 ; RV64-NEXT: andi a2, a3, 2047 ; RV64-NEXT: andi a1, a1, 
2047 diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index 3ef9f3f..5a3dfd1 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -157,10 +157,10 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul a7, a7, t1 ; RV64IM-NEXT: slli t1, a5, 7 ; RV64IM-NEXT: slli a5, a5, 2 -; RV64IM-NEXT: subw a5, a5, t1 -; RV64IM-NEXT: subw a2, a2, a6 -; RV64IM-NEXT: subw a4, a4, t0 -; RV64IM-NEXT: subw a1, a1, a7 +; RV64IM-NEXT: sub a5, a5, t1 +; RV64IM-NEXT: sub a2, a2, a6 +; RV64IM-NEXT: sub a4, a4, t0 +; RV64IM-NEXT: sub a1, a1, a7 ; RV64IM-NEXT: add a3, a3, a5 ; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a3, 2(a0) @@ -300,10 +300,10 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: mul t0, t0, a6 ; RV64IM-NEXT: mul t1, t1, a6 ; RV64IM-NEXT: mul a2, a2, a6 -; RV64IM-NEXT: subw a3, a3, a7 -; RV64IM-NEXT: subw a4, a4, t0 -; RV64IM-NEXT: subw a5, a5, t1 -; RV64IM-NEXT: subw a1, a1, a2 +; RV64IM-NEXT: sub a3, a3, a7 +; RV64IM-NEXT: sub a4, a4, t0 +; RV64IM-NEXT: sub a5, a5, t1 +; RV64IM-NEXT: sub a1, a1, a2 ; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: sh a4, 2(a0) ; RV64IM-NEXT: sh a5, 4(a0) @@ -508,10 +508,10 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; RV64IM-NEXT: add a1, a1, t1 ; RV64IM-NEXT: add a3, a3, t0 ; RV64IM-NEXT: add a4, a4, a7 -; RV64IM-NEXT: subw a2, a2, a6 -; RV64IM-NEXT: subw a1, a1, t4 -; RV64IM-NEXT: subw a3, a3, t3 -; RV64IM-NEXT: subw a4, a4, t2 +; RV64IM-NEXT: sub a2, a2, a6 +; RV64IM-NEXT: sub a1, a1, t4 +; RV64IM-NEXT: sub a3, a3, t3 +; RV64IM-NEXT: sub a4, a4, t2 ; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a1, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) @@ -622,7 +622,7 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind { ; RV64IM-NEXT: andi a4, a4, 7 ; RV64IM-NEXT: mulhu a5, a1, a5 ; RV64IM-NEXT: mul a5, a5, a6 -; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: sub a1, a1, a5 ; RV64IM-NEXT: sh a2, 0(a0) ; RV64IM-NEXT: sh a3, 2(a0) ; RV64IM-NEXT: sh a4, 4(a0) @@ -757,9 +757,9 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind { ; RV64IM-NEXT: addi a7, a7, 1327 ; RV64IM-NEXT: mulhu a5, a1, a5 ; RV64IM-NEXT: mul a5, a5, a7 -; RV64IM-NEXT: subw a2, a2, a4 -; RV64IM-NEXT: subw a3, a3, a6 -; RV64IM-NEXT: subw a1, a1, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sub a3, a3, a6 +; RV64IM-NEXT: sub a1, a1, a5 ; RV64IM-NEXT: sh zero, 0(a0) ; RV64IM-NEXT: sh a2, 2(a0) ; RV64IM-NEXT: sh a3, 4(a0) diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index 32753ca..cd7f30d 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -716,92 +716,101 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: lbu a5, 8(a0) +; RV32I-NEXT: lbu a6, 9(a0) +; RV32I-NEXT: lbu t3, 10(a0) +; RV32I-NEXT: lbu t4, 11(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a4, a4, a3 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a3, t0, a7 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: lbu t0, 10(a0) -; RV32I-NEXT: lbu t3, 11(a0) ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: slli t0, t0, 16 -; 
RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: or a7, t3, t0 -; RV32I-NEXT: lbu t0, 12(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 14(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu a0, 0(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or a6, t1, a6 +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu t1, 1(a1) +; RV32I-NEXT: lbu t2, 0(a1) +; RV32I-NEXT: lbu t4, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 ; RV32I-NEXT: sw zero, 16(sp) ; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or a1, t2, t0 -; RV32I-NEXT: mv t0, sp -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t2, t4, t3 -; RV32I-NEXT: srli t3, a0, 3 -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: andi a5, a0, 31 -; RV32I-NEXT: andi t3, t3, 12 -; RV32I-NEXT: xori a5, a5, 31 -; RV32I-NEXT: or a3, t1, a3 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: or a1, t2, a1 -; RV32I-NEXT: add t0, t0, t3 -; RV32I-NEXT: sw a4, 0(sp) -; RV32I-NEXT: sw a3, 4(sp) -; RV32I-NEXT: sw a6, 8(sp) -; RV32I-NEXT: sw a1, 12(sp) -; RV32I-NEXT: lw a1, 4(t0) -; RV32I-NEXT: lw a3, 8(t0) -; RV32I-NEXT: lw a4, 0(t0) -; RV32I-NEXT: lw a6, 12(t0) -; RV32I-NEXT: srl a7, a1, a0 -; RV32I-NEXT: slli t0, a3, 1 -; RV32I-NEXT: srl a4, a4, a0 -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: srl a3, a3, a0 -; RV32I-NEXT: slli t1, a6, 1 -; RV32I-NEXT: srl a0, a6, a0 -; RV32I-NEXT: sll a6, t0, a5 -; RV32I-NEXT: sll a1, a1, a5 -; RV32I-NEXT: sll a5, t1, a5 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: mv t2, sp +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, t0, a7 +; RV32I-NEXT: or a5, t3, a5 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: srli a0, a1, 3 +; RV32I-NEXT: andi a3, a1, 31 +; RV32I-NEXT: andi a0, a0, 12 +; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: add a0, t2, a0 +; RV32I-NEXT: lw a4, 4(a0) +; RV32I-NEXT: lw a5, 8(a0) +; RV32I-NEXT: lw a6, 0(a0) +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: srl a7, a4, a1 +; RV32I-NEXT: slli t0, a5, 1 +; RV32I-NEXT: srl a6, a6, a1 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: srl a5, a5, a1 +; RV32I-NEXT: slli t1, a0, 1 +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: sll a1, t0, a3 +; RV32I-NEXT: sll a4, a4, a3 +; RV32I-NEXT: sll a3, t1, a3 ; RV32I-NEXT: srli t0, a0, 16 ; RV32I-NEXT: srli t1, a0, 24 ; RV32I-NEXT: srli t2, a0, 8 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: or a1, a4, a1 -; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: or a1, a7, a1 +; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) ; RV32I-NEXT: sb t1, 15(a2) ; RV32I-NEXT: srli a0, a3, 16 -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 8 -; RV32I-NEXT: srli a7, a1, 16 -; RV32I-NEXT: srli t0, a1, 24 -; RV32I-NEXT: srli t1, a1, 8 -; RV32I-NEXT: srli t2, a6, 16 -; RV32I-NEXT: 
srli t3, a6, 24 +; RV32I-NEXT: srli a5, a3, 24 +; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a4, 16 +; RV32I-NEXT: srli t0, a4, 24 +; RV32I-NEXT: srli t1, a4, 8 +; RV32I-NEXT: srli t2, a1, 16 +; RV32I-NEXT: srli t3, a1, 24 ; RV32I-NEXT: sb a3, 8(a2) -; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: sb a6, 9(a2) ; RV32I-NEXT: sb a0, 10(a2) -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: srli a0, a6, 8 -; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a0, a1, 8 +; RV32I-NEXT: sb a4, 0(a2) ; RV32I-NEXT: sb t1, 1(a2) ; RV32I-NEXT: sb a7, 2(a2) ; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a6, 4(a2) +; RV32I-NEXT: sb a1, 4(a2) ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: sb t2, 6(a2) ; RV32I-NEXT: sb t3, 7(a2) @@ -943,93 +952,102 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: lbu a5, 8(a0) +; RV32I-NEXT: lbu a6, 9(a0) +; RV32I-NEXT: lbu t3, 10(a0) +; RV32I-NEXT: lbu t4, 11(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a4, a4, a3 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a3, t0, a7 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: lbu t0, 10(a0) -; RV32I-NEXT: lbu t3, 11(a0) ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: or a7, t3, t0 -; RV32I-NEXT: lbu t0, 12(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 14(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu a0, 0(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 12(a0) +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t3, t4, t3 +; RV32I-NEXT: or a6, t1, a6 +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu t1, 1(a1) +; RV32I-NEXT: lbu t2, 0(a1) +; RV32I-NEXT: lbu t4, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: or a1, t2, t0 -; RV32I-NEXT: addi t0, sp, 16 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or t2, t4, t3 -; RV32I-NEXT: srli t3, a0, 3 -; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: andi a5, a0, 31 -; RV32I-NEXT: andi t3, t3, 12 -; RV32I-NEXT: or a3, t1, a3 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: or a1, t2, a1 -; RV32I-NEXT: sub a7, t0, t3 -; RV32I-NEXT: sw a4, 16(sp) -; RV32I-NEXT: sw a3, 20(sp) -; RV32I-NEXT: sw a6, 24(sp) -; RV32I-NEXT: sw a1, 28(sp) -; RV32I-NEXT: lw a1, 0(a7) -; RV32I-NEXT: lw a3, 4(a7) -; RV32I-NEXT: lw a4, 8(a7) -; RV32I-NEXT: lw a6, 12(a7) -; RV32I-NEXT: xori a5, a5, 31 -; RV32I-NEXT: sll a7, a3, a0 -; RV32I-NEXT: srli t0, a1, 1 -; RV32I-NEXT: sll a6, a6, a0 -; RV32I-NEXT: srli t1, a4, 1 -; RV32I-NEXT: sll a4, a4, a0 -; RV32I-NEXT: srli a3, a3, 1 -; RV32I-NEXT: sll a0, a1, a0 -; RV32I-NEXT: srl a1, t0, a5 -; RV32I-NEXT: srl t0, t1, a5 -; RV32I-NEXT: srl a3, a3, a5 -; RV32I-NEXT: srli a5, a0, 16 -; RV32I-NEXT: srli t1, a0, 24 -; RV32I-NEXT: srli 
t2, a0, 8 -; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: or a6, a6, t0 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: addi t2, sp, 16 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: or a4, t0, a7 +; RV32I-NEXT: or a5, t3, a5 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: sw a4, 20(sp) +; RV32I-NEXT: sw a5, 24(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: srli a0, a1, 3 +; RV32I-NEXT: andi a3, a1, 31 +; RV32I-NEXT: andi a0, a0, 12 +; RV32I-NEXT: sub a0, t2, a0 +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a5, 4(a0) +; RV32I-NEXT: lw a6, 8(a0) +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: sll a7, a5, a1 +; RV32I-NEXT: srli t0, a4, 1 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: srli t1, a6, 1 +; RV32I-NEXT: sll a6, a6, a1 +; RV32I-NEXT: srli a5, a5, 1 +; RV32I-NEXT: sll a1, a4, a1 +; RV32I-NEXT: srl a4, t0, a3 +; RV32I-NEXT: srl t0, t1, a3 +; RV32I-NEXT: srl a3, a5, a3 +; RV32I-NEXT: srli a5, a1, 16 +; RV32I-NEXT: srli t1, a1, 24 +; RV32I-NEXT: srli t2, a1, 8 +; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb t2, 1(a2) ; RV32I-NEXT: sb a5, 2(a2) ; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: srli a0, a3, 16 -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: srli a5, a3, 8 -; RV32I-NEXT: srli a7, a6, 16 -; RV32I-NEXT: srli t0, a6, 24 -; RV32I-NEXT: srli t1, a6, 8 -; RV32I-NEXT: srli t2, a1, 16 -; RV32I-NEXT: srli t3, a1, 24 +; RV32I-NEXT: srli a1, a3, 16 +; RV32I-NEXT: srli a5, a3, 24 +; RV32I-NEXT: srli a6, a3, 8 +; RV32I-NEXT: srli a7, a0, 16 +; RV32I-NEXT: srli t0, a0, 24 +; RV32I-NEXT: srli t1, a0, 8 +; RV32I-NEXT: srli t2, a4, 16 +; RV32I-NEXT: srli t3, a4, 24 ; RV32I-NEXT: sb a3, 8(a2) -; RV32I-NEXT: sb a5, 9(a2) -; RV32I-NEXT: sb a0, 10(a2) -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: srli a0, a1, 8 -; RV32I-NEXT: sb a6, 12(a2) +; RV32I-NEXT: sb a6, 9(a2) +; RV32I-NEXT: sb a1, 10(a2) +; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a1, a4, 8 +; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t1, 13(a2) ; RV32I-NEXT: sb a7, 14(a2) ; RV32I-NEXT: sb t0, 15(a2) -; RV32I-NEXT: sb a1, 4(a2) -; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: sb a1, 5(a2) ; RV32I-NEXT: sb t2, 6(a2) ; RV32I-NEXT: sb t3, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 @@ -1168,73 +1186,82 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 8(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t4, 10(a0) +; RV32I-NEXT: lbu t5, 11(a0) ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: lbu a6, 8(a0) -; RV32I-NEXT: lbu a7, 9(a0) -; RV32I-NEXT: lbu t0, 10(a0) -; RV32I-NEXT: lbu t3, 11(a0) ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: or a7, t3, t0 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 ; RV32I-NEXT: lbu t0, 12(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 14(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu a0, 0(a1) -; RV32I-NEXT: slli t2, t2, 8 
-; RV32I-NEXT: or a1, t2, t0 -; RV32I-NEXT: mv t0, sp -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: srli a4, a0, 3 -; RV32I-NEXT: or a5, t1, a5 -; RV32I-NEXT: andi t1, a0, 31 -; RV32I-NEXT: or t2, t4, t3 -; RV32I-NEXT: srai t3, t4, 31 -; RV32I-NEXT: andi a4, a4, 12 -; RV32I-NEXT: xori t1, t1, 31 +; RV32I-NEXT: lbu t1, 13(a0) +; RV32I-NEXT: lbu t2, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or a4, t3, a4 +; RV32I-NEXT: or t3, t5, t4 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: lbu t1, 1(a1) +; RV32I-NEXT: lbu t4, 0(a1) +; RV32I-NEXT: lbu t5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t4 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t5 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: mv a5, sp +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t2, a0, t2 +; RV32I-NEXT: srai a0, a0, 31 ; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: or a1, t2, a1 -; RV32I-NEXT: sw t3, 16(sp) -; RV32I-NEXT: sw t3, 20(sp) -; RV32I-NEXT: sw t3, 24(sp) -; RV32I-NEXT: sw t3, 28(sp) -; RV32I-NEXT: add a4, t0, a4 +; RV32I-NEXT: or a4, t3, a4 +; RV32I-NEXT: or a7, t2, t0 +; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: sw a0, 16(sp) +; RV32I-NEXT: sw a0, 20(sp) +; RV32I-NEXT: sw a0, 24(sp) +; RV32I-NEXT: sw a0, 28(sp) ; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a5, 4(sp) -; RV32I-NEXT: sw a6, 8(sp) -; RV32I-NEXT: sw a1, 12(sp) -; RV32I-NEXT: lw a1, 4(a4) -; RV32I-NEXT: lw a3, 8(a4) -; RV32I-NEXT: lw a5, 0(a4) -; RV32I-NEXT: lw a4, 12(a4) -; RV32I-NEXT: srl a6, a1, a0 -; RV32I-NEXT: slli a7, a3, 1 -; RV32I-NEXT: srl a5, a5, a0 -; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: srl a3, a3, a0 -; RV32I-NEXT: slli t0, a4, 1 -; RV32I-NEXT: sra a0, a4, a0 -; RV32I-NEXT: sll a4, a7, t1 -; RV32I-NEXT: sll a1, a1, t1 -; RV32I-NEXT: sll a7, t0, t1 +; RV32I-NEXT: sw a6, 4(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a7, 12(sp) +; RV32I-NEXT: srli a0, a1, 3 +; RV32I-NEXT: andi a3, a1, 31 +; RV32I-NEXT: andi a0, a0, 12 +; RV32I-NEXT: xori a3, a3, 31 +; RV32I-NEXT: add a0, a5, a0 +; RV32I-NEXT: lw a4, 4(a0) +; RV32I-NEXT: lw a5, 8(a0) +; RV32I-NEXT: lw a6, 0(a0) +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: srl a7, a4, a1 +; RV32I-NEXT: slli t0, a5, 1 +; RV32I-NEXT: srl a6, a6, a1 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: srl a5, a5, a1 +; RV32I-NEXT: slli t1, a0, 1 +; RV32I-NEXT: sra a0, a0, a1 +; RV32I-NEXT: sll a1, t0, a3 +; RV32I-NEXT: sll a4, a4, a3 +; RV32I-NEXT: sll a3, t1, a3 ; RV32I-NEXT: srli t0, a0, 16 ; RV32I-NEXT: srli t1, a0, 24 ; RV32I-NEXT: srli t2, a0, 8 +; RV32I-NEXT: or a1, a7, a1 ; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: or a1, a5, a1 -; RV32I-NEXT: or a3, a3, a7 +; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) @@ -1242,21 +1269,21 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: srli a0, a3, 16 ; RV32I-NEXT: srli a5, a3, 24 ; RV32I-NEXT: srli a6, a3, 8 -; RV32I-NEXT: srli a7, a1, 16 -; RV32I-NEXT: srli t0, a1, 24 -; RV32I-NEXT: srli t1, a1, 8 -; RV32I-NEXT: srli t2, a4, 16 -; RV32I-NEXT: srli t3, a4, 24 +; RV32I-NEXT: srli a7, a4, 16 +; RV32I-NEXT: srli t0, a4, 24 +; RV32I-NEXT: srli t1, a4, 8 +; RV32I-NEXT: srli t2, a1, 16 +; RV32I-NEXT: srli t3, a1, 24 ; RV32I-NEXT: sb a3, 8(a2) ; RV32I-NEXT: sb a6, 9(a2) ; 
RV32I-NEXT: sb a0, 10(a2) ; RV32I-NEXT: sb a5, 11(a2) -; RV32I-NEXT: srli a0, a4, 8 -; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: srli a0, a1, 8 +; RV32I-NEXT: sb a4, 0(a2) ; RV32I-NEXT: sb t1, 1(a2) ; RV32I-NEXT: sb a7, 2(a2) ; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: sb a1, 4(a2) ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: sb t2, 6(a2) ; RV32I-NEXT: sb t3, 7(a2) @@ -1272,17 +1299,19 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -144 -; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -1299,122 +1328,143 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s1, 13(a0) ; RV64I-NEXT: lbu s2, 14(a0) ; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: lbu s4, 16(a0) ; RV64I-NEXT: lbu s5, 17(a0) ; RV64I-NEXT: lbu s6, 18(a0) ; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli s8, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: or a6, a6, s8 +; RV64I-NEXT: or a3, t0, a7 +; RV64I-NEXT: or a4, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t5, t5, 16 ; RV64I-NEXT: slli t6, t6, 24 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: lbu t5, 20(a0) -; RV64I-NEXT: lbu t6, 21(a0) -; RV64I-NEXT: lbu s8, 22(a0) -; RV64I-NEXT: lbu s9, 23(a0) ; RV64I-NEXT: slli s1, s1, 8 ; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s6, s6, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or t1, s1, s0 -; 
RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: slli s9, s9, 8 ; RV64I-NEXT: or t3, s5, s4 ; RV64I-NEXT: or t4, s7, s6 -; RV64I-NEXT: lbu s0, 24(a0) -; RV64I-NEXT: lbu s1, 25(a0) -; RV64I-NEXT: lbu s2, 26(a0) -; RV64I-NEXT: lbu s3, 27(a0) -; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s8, s8, 16 -; RV64I-NEXT: slli s9, s9, 24 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: or t6, s9, s8 -; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: lbu s1, 28(a0) +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) ; RV64I-NEXT: lbu s4, 29(a0) ; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu s6, 31(a0) -; RV64I-NEXT: lbu a0, 0(a1) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) +; RV64I-NEXT: lbu s7, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, s7 ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) -; RV64I-NEXT: slli s2, s2, 16 -; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: or a1, s3, s2 -; RV64I-NEXT: mv s2, sp -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: slli s5, s5, 16 -; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: or s1, s4, s1 -; RV64I-NEXT: srli s3, a0, 3 -; RV64I-NEXT: or s4, s6, s5 -; RV64I-NEXT: andi s5, a0, 63 -; RV64I-NEXT: andi s3, s3, 24 -; RV64I-NEXT: xori s5, s5, 63 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: or a1, a1, s0 -; RV64I-NEXT: or t1, s4, s1 -; RV64I-NEXT: add s2, s2, s3 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: slli t0, t0, 32 -; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: mv a6, sp ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a1, t1, a1 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: or t0, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t1, s0, t6 +; RV64I-NEXT: or t2, s5, s1 +; RV64I-NEXT: or t3, s3, s2 +; RV64I-NEXT: or a1, a1, s4 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t2, t2, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a3, a3, a5 +; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a5, t2, t1 +; RV64I-NEXT: or a1, a1, t3 ; RV64I-NEXT: sd a3, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) -; RV64I-NEXT: sd a1, 24(sp) -; RV64I-NEXT: ld a1, 8(s2) -; RV64I-NEXT: ld a3, 16(s2) -; RV64I-NEXT: ld a4, 0(s2) -; RV64I-NEXT: ld a5, 24(s2) -; RV64I-NEXT: srl a6, a1, a0 -; RV64I-NEXT: slli a7, a3, 1 -; RV64I-NEXT: srl a4, a4, a0 -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: srl a3, a3, a0 +; 
RV64I-NEXT: sd a0, 16(sp) +; RV64I-NEXT: sd a5, 24(sp) +; RV64I-NEXT: srli a0, a1, 3 +; RV64I-NEXT: andi a3, a1, 63 +; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: xori a3, a3, 63 +; RV64I-NEXT: add a0, a6, a0 +; RV64I-NEXT: ld a4, 8(a0) +; RV64I-NEXT: ld a5, 16(a0) +; RV64I-NEXT: ld a6, 0(a0) +; RV64I-NEXT: ld a0, 24(a0) +; RV64I-NEXT: srl a7, a4, a1 ; RV64I-NEXT: slli t0, a5, 1 -; RV64I-NEXT: srl a5, a5, a0 -; RV64I-NEXT: sll a0, a7, s5 -; RV64I-NEXT: sll a1, a1, s5 -; RV64I-NEXT: sll a7, t0, s5 -; RV64I-NEXT: srli t0, a5, 56 -; RV64I-NEXT: srli t1, a5, 48 -; RV64I-NEXT: srli t2, a5, 40 -; RV64I-NEXT: srli t3, a5, 32 -; RV64I-NEXT: srli t4, a5, 24 -; RV64I-NEXT: srli t5, a5, 16 -; RV64I-NEXT: srli t6, a5, 8 -; RV64I-NEXT: or a0, a6, a0 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: or a3, a3, a7 +; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: srl a5, a5, a1 +; RV64I-NEXT: slli t1, a0, 1 +; RV64I-NEXT: srl t2, a0, a1 +; RV64I-NEXT: sll a0, t0, a3 +; RV64I-NEXT: sll a1, a4, a3 +; RV64I-NEXT: sll a3, t1, a3 +; RV64I-NEXT: srli a4, t2, 56 +; RV64I-NEXT: srli t0, t2, 48 +; RV64I-NEXT: srli t1, t2, 40 +; RV64I-NEXT: srli t3, t2, 32 +; RV64I-NEXT: srli t4, t2, 24 +; RV64I-NEXT: srli t5, t2, 16 +; RV64I-NEXT: srli t6, t2, 8 +; RV64I-NEXT: or a0, a7, a0 +; RV64I-NEXT: or a1, a6, a1 +; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: sb t3, 28(a2) -; RV64I-NEXT: sb t2, 29(a2) -; RV64I-NEXT: sb t1, 30(a2) -; RV64I-NEXT: sb t0, 31(a2) -; RV64I-NEXT: sb a5, 24(a2) +; RV64I-NEXT: sb t1, 29(a2) +; RV64I-NEXT: sb t0, 30(a2) +; RV64I-NEXT: sb a4, 31(a2) +; RV64I-NEXT: sb t2, 24(a2) ; RV64I-NEXT: sb t6, 25(a2) ; RV64I-NEXT: sb t5, 26(a2) ; RV64I-NEXT: sb t4, 27(a2) @@ -1463,17 +1513,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a1, 9(a2) ; RV64I-NEXT: sb a5, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 144 +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_32bytes: @@ -1498,55 +1550,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a7, 3(a0) ; RV32I-NEXT: lbu a5, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t3, 6(a0) -; RV32I-NEXT: lbu t6, 7(a0) -; RV32I-NEXT: lbu s2, 8(a0) -; RV32I-NEXT: lbu s3, 9(a0) -; RV32I-NEXT: lbu s4, 10(a0) -; RV32I-NEXT: lbu s5, 11(a0) -; RV32I-NEXT: lbu s7, 12(a0) -; RV32I-NEXT: 
lbu s8, 13(a0) -; RV32I-NEXT: lbu s9, 14(a0) -; RV32I-NEXT: lbu s10, 15(a0) -; RV32I-NEXT: lbu s11, 16(a0) -; RV32I-NEXT: lbu ra, 17(a0) -; RV32I-NEXT: lbu t4, 18(a0) -; RV32I-NEXT: lbu s0, 19(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s2, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 ; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: or a4, a7, a6 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: lbu t5, 22(a0) -; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu ra, 22(a0) +; RV32I-NEXT: lbu a3, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: lbu s1, 24(a0) +; RV32I-NEXT: lbu s3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) +; RV32I-NEXT: slli s2, s2, 8 ; RV32I-NEXT: slli s4, s4, 16 ; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t6, t3 -; RV32I-NEXT: or a7, s3, s2 -; RV32I-NEXT: or t0, s5, s4 -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu s5, 25(a0) -; RV32I-NEXT: lbu s6, 26(a0) -; RV32I-NEXT: lbu t6, 27(a0) -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: slli s9, s9, 16 -; RV32I-NEXT: slli s10, s10, 24 -; RV32I-NEXT: slli ra, ra, 8 -; RV32I-NEXT: or s7, s8, s7 -; RV32I-NEXT: or s2, s10, s9 -; RV32I-NEXT: or s3, ra, s11 -; RV32I-NEXT: lbu s4, 28(a0) -; RV32I-NEXT: lbu s8, 29(a0) -; RV32I-NEXT: lbu s9, 30(a0) -; RV32I-NEXT: lbu s10, 31(a0) -; RV32I-NEXT: lbu a0, 0(a1) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or t3, s7, s6 +; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu s6, 31(a0) +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: or s2, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s7, 1(a1) +; RV32I-NEXT: lbu s8, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) ; RV32I-NEXT: sw zero, 64(sp) @@ -1555,90 +1619,89 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw zero, 44(sp) ; RV32I-NEXT: sw zero, 48(sp) ; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: addi s3, sp, 8 ; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or t4, s0, t4 -; RV32I-NEXT: addi s0, sp, 8 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: slli s9, s9, 16 -; 
RV32I-NEXT: slli s10, s10, 24 -; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s4, t6 +; RV32I-NEXT: or t6, s6, s5 +; RV32I-NEXT: or a3, s7, a3 +; RV32I-NEXT: or a1, a1, s8 +; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s4 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or t0, a0, t3 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, t4, s1 +; RV32I-NEXT: or t3, t6, t5 +; RV32I-NEXT: or a0, a1, a3 +; RV32I-NEXT: sw t0, 24(sp) +; RV32I-NEXT: sw t1, 28(sp) +; RV32I-NEXT: sw t2, 32(sp) +; RV32I-NEXT: sw t3, 36(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: sw a7, 20(sp) ; RV32I-NEXT: srli a1, a0, 3 -; RV32I-NEXT: or t2, s1, t5 -; RV32I-NEXT: andi t5, a0, 31 -; RV32I-NEXT: or t3, s5, t3 -; RV32I-NEXT: or t6, t6, s6 -; RV32I-NEXT: or s1, s8, s4 -; RV32I-NEXT: or s4, s10, s9 -; RV32I-NEXT: andi s5, a1, 28 -; RV32I-NEXT: xori a1, t5, 31 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, s2, s7 -; RV32I-NEXT: or a7, t4, s3 -; RV32I-NEXT: or t0, t2, t1 -; RV32I-NEXT: or t1, t6, t3 -; RV32I-NEXT: or t2, s4, s1 -; RV32I-NEXT: add s0, s0, s5 -; RV32I-NEXT: sw a7, 24(sp) -; RV32I-NEXT: sw t0, 28(sp) -; RV32I-NEXT: sw t1, 32(sp) -; RV32I-NEXT: sw t2, 36(sp) -; RV32I-NEXT: sw a3, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) -; RV32I-NEXT: sw a5, 16(sp) -; RV32I-NEXT: sw a6, 20(sp) -; RV32I-NEXT: lw a3, 0(s0) -; RV32I-NEXT: lw a4, 4(s0) -; RV32I-NEXT: lw a5, 8(s0) -; RV32I-NEXT: lw a6, 12(s0) -; RV32I-NEXT: lw a7, 16(s0) -; RV32I-NEXT: lw t0, 20(s0) -; RV32I-NEXT: lw t1, 24(s0) -; RV32I-NEXT: lw t2, 28(s0) -; RV32I-NEXT: srl t3, a4, a0 -; RV32I-NEXT: slli t4, a5, 1 +; RV32I-NEXT: andi a3, a0, 31 +; RV32I-NEXT: andi a4, a1, 28 +; RV32I-NEXT: xori a1, a3, 31 +; RV32I-NEXT: add a4, s3, a4 +; RV32I-NEXT: lw a3, 0(a4) +; RV32I-NEXT: lw a5, 4(a4) +; RV32I-NEXT: lw a6, 8(a4) +; RV32I-NEXT: lw a7, 12(a4) +; RV32I-NEXT: lw t0, 16(a4) +; RV32I-NEXT: lw t1, 20(a4) +; RV32I-NEXT: lw t2, 24(a4) +; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: srl t3, a5, a0 +; RV32I-NEXT: slli t4, a6, 1 ; RV32I-NEXT: srl a3, a3, a0 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: srl t5, a6, a0 -; RV32I-NEXT: slli t6, a7, 1 -; RV32I-NEXT: srl a5, a5, a0 -; RV32I-NEXT: slli a6, a6, 1 -; RV32I-NEXT: srl s0, t0, a0 -; RV32I-NEXT: slli s1, t1, 1 -; RV32I-NEXT: srl a7, a7, a0 -; RV32I-NEXT: slli t0, t0, 1 -; RV32I-NEXT: srl t1, t1, a0 -; RV32I-NEXT: slli s2, t2, 1 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl t5, a7, a0 +; RV32I-NEXT: slli t6, t0, 1 +; RV32I-NEXT: srl a6, a6, a0 +; RV32I-NEXT: slli a7, a7, 1 +; RV32I-NEXT: srl s0, t1, a0 +; RV32I-NEXT: slli s1, t2, 1 +; RV32I-NEXT: srl t0, t0, a0 +; RV32I-NEXT: slli t1, t1, 1 ; RV32I-NEXT: srl t2, t2, a0 +; RV32I-NEXT: slli s2, a4, 1 +; RV32I-NEXT: srl s3, a4, a0 ; RV32I-NEXT: sll a0, t4, a1 -; RV32I-NEXT: sll a4, a4, a1 -; RV32I-NEXT: sll t4, t6, a1 -; RV32I-NEXT: sll a6, a6, a1 -; RV32I-NEXT: sll t6, s1, a1 -; RV32I-NEXT: sll t0, t0, a1 -; RV32I-NEXT: sll s1, s2, a1 -; RV32I-NEXT: srli s2, t2, 24 -; RV32I-NEXT: srli s3, t2, 16 -; RV32I-NEXT: srli s4, t2, 8 +; RV32I-NEXT: sll a4, a5, a1 +; RV32I-NEXT: sll a5, t6, a1 +; RV32I-NEXT: sll a7, a7, a1 +; RV32I-NEXT: sll 
t4, s1, a1 +; RV32I-NEXT: sll t1, t1, a1 +; RV32I-NEXT: sll t6, s2, a1 +; RV32I-NEXT: srli s1, s3, 24 +; RV32I-NEXT: srli s2, s3, 16 +; RV32I-NEXT: srli s4, s3, 8 ; RV32I-NEXT: or a0, t3, a0 ; RV32I-NEXT: or a1, a3, a4 -; RV32I-NEXT: or a3, t5, t4 -; RV32I-NEXT: or a4, a5, a6 -; RV32I-NEXT: or a5, s0, t6 -; RV32I-NEXT: or a6, a7, t0 -; RV32I-NEXT: or a7, t1, s1 -; RV32I-NEXT: sb t2, 28(a2) +; RV32I-NEXT: or a3, t5, a5 +; RV32I-NEXT: or a4, a6, a7 +; RV32I-NEXT: or a5, s0, t4 +; RV32I-NEXT: or a6, t0, t1 +; RV32I-NEXT: or a7, t2, t6 +; RV32I-NEXT: sb s3, 28(a2) ; RV32I-NEXT: sb s4, 29(a2) -; RV32I-NEXT: sb s3, 30(a2) -; RV32I-NEXT: sb s2, 31(a2) +; RV32I-NEXT: sb s2, 30(a2) +; RV32I-NEXT: sb s1, 31(a2) ; RV32I-NEXT: srli t0, a7, 24 ; RV32I-NEXT: srli t1, a7, 16 ; RV32I-NEXT: srli t2, a7, 8 @@ -1712,17 +1775,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -144 -; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -1739,125 +1804,146 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s1, 13(a0) ; RV64I-NEXT: lbu s2, 14(a0) ; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: lbu s4, 16(a0) ; RV64I-NEXT: lbu s5, 17(a0) ; RV64I-NEXT: lbu s6, 18(a0) ; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli s8, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: or a6, a6, s8 +; RV64I-NEXT: or a3, t0, a7 +; RV64I-NEXT: or a4, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t5, t5, 16 ; RV64I-NEXT: slli t6, t6, 24 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: lbu t5, 20(a0) -; RV64I-NEXT: lbu t6, 21(a0) -; RV64I-NEXT: lbu s8, 22(a0) -; RV64I-NEXT: lbu s9, 23(a0) ; RV64I-NEXT: slli s1, s1, 8 ; RV64I-NEXT: 
slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s6, s6, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or t1, s1, s0 -; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: slli s9, s9, 8 ; RV64I-NEXT: or t3, s5, s4 ; RV64I-NEXT: or t4, s7, s6 -; RV64I-NEXT: lbu s0, 24(a0) -; RV64I-NEXT: lbu s1, 25(a0) -; RV64I-NEXT: lbu s2, 26(a0) -; RV64I-NEXT: lbu s3, 27(a0) -; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s8, s8, 16 -; RV64I-NEXT: slli s9, s9, 24 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: or t6, s9, s8 -; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: lbu s1, 28(a0) +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) ; RV64I-NEXT: lbu s4, 29(a0) ; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu s6, 31(a0) -; RV64I-NEXT: lbu a0, 0(a1) +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) +; RV64I-NEXT: lbu s7, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, s7 ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: slli s2, s2, 16 -; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: or a1, s3, s2 -; RV64I-NEXT: addi s2, sp, 32 -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: slli s5, s5, 16 -; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: or s1, s4, s1 -; RV64I-NEXT: srli s3, a0, 3 -; RV64I-NEXT: or s4, s6, s5 -; RV64I-NEXT: andi s5, a0, 63 -; RV64I-NEXT: andi s3, s3, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: or a1, a1, s0 -; RV64I-NEXT: or t1, s4, s1 -; RV64I-NEXT: sub t2, s2, s3 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: slli t0, t0, 32 -; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: addi a6, sp, 32 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a1, t1, a1 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: or a7, t2, t1 +; RV64I-NEXT: or t0, t4, t3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t1, s0, t6 +; RV64I-NEXT: or t2, s5, s1 +; RV64I-NEXT: or t3, s3, s2 +; RV64I-NEXT: or a1, a1, s4 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t2, t2, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a3, a3, a5 +; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a5, t2, t1 +; RV64I-NEXT: or a1, a1, t3 ; 
RV64I-NEXT: sd a3, 32(sp) ; RV64I-NEXT: sd a4, 40(sp) -; RV64I-NEXT: sd a5, 48(sp) -; RV64I-NEXT: sd a1, 56(sp) -; RV64I-NEXT: ld a1, 0(t2) -; RV64I-NEXT: ld a3, 8(t2) -; RV64I-NEXT: ld a4, 16(t2) -; RV64I-NEXT: ld a5, 24(t2) -; RV64I-NEXT: xori a6, s5, 63 -; RV64I-NEXT: sll a7, a3, a0 -; RV64I-NEXT: srli t0, a1, 1 -; RV64I-NEXT: sll a5, a5, a0 -; RV64I-NEXT: srli t1, a4, 1 -; RV64I-NEXT: sll a4, a4, a0 -; RV64I-NEXT: srli a3, a3, 1 -; RV64I-NEXT: sll t2, a1, a0 -; RV64I-NEXT: srl a0, t0, a6 -; RV64I-NEXT: srl a1, t1, a6 -; RV64I-NEXT: srl a3, a3, a6 -; RV64I-NEXT: srli a6, t2, 56 -; RV64I-NEXT: srli t0, t2, 48 -; RV64I-NEXT: srli t1, t2, 40 -; RV64I-NEXT: srli t3, t2, 32 -; RV64I-NEXT: srli t4, t2, 24 -; RV64I-NEXT: srli t5, t2, 16 -; RV64I-NEXT: srli t6, t2, 8 -; RV64I-NEXT: or a0, a7, a0 -; RV64I-NEXT: or a1, a5, a1 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: sb t3, 4(a2) -; RV64I-NEXT: sb t1, 5(a2) -; RV64I-NEXT: sb t0, 6(a2) -; RV64I-NEXT: sb a6, 7(a2) -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb t6, 1(a2) -; RV64I-NEXT: sb t5, 2(a2) -; RV64I-NEXT: sb t4, 3(a2) +; RV64I-NEXT: sd a0, 48(sp) +; RV64I-NEXT: sd a5, 56(sp) +; RV64I-NEXT: srli a0, a1, 3 +; RV64I-NEXT: andi a3, a1, 63 +; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: sub a0, a6, a0 +; RV64I-NEXT: ld a4, 0(a0) +; RV64I-NEXT: ld a5, 8(a0) +; RV64I-NEXT: ld a6, 16(a0) +; RV64I-NEXT: ld a0, 24(a0) +; RV64I-NEXT: xori a3, a3, 63 +; RV64I-NEXT: sll a7, a5, a1 +; RV64I-NEXT: srli t0, a4, 1 +; RV64I-NEXT: sll t1, a0, a1 +; RV64I-NEXT: srli a0, a6, 1 +; RV64I-NEXT: sll a6, a6, a1 +; RV64I-NEXT: srli a5, a5, 1 +; RV64I-NEXT: sll a4, a4, a1 +; RV64I-NEXT: srl a1, t0, a3 +; RV64I-NEXT: srl t0, a0, a3 +; RV64I-NEXT: srl a3, a5, a3 +; RV64I-NEXT: srli a5, a4, 56 +; RV64I-NEXT: srli t2, a4, 48 +; RV64I-NEXT: srli t3, a4, 40 +; RV64I-NEXT: srli t4, a4, 32 +; RV64I-NEXT: srli t5, a4, 24 +; RV64I-NEXT: srli t6, a4, 16 +; RV64I-NEXT: srli s0, a4, 8 +; RV64I-NEXT: or a0, a7, a1 +; RV64I-NEXT: or a1, t1, t0 +; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: sb t4, 4(a2) +; RV64I-NEXT: sb t3, 5(a2) +; RV64I-NEXT: sb t2, 6(a2) +; RV64I-NEXT: sb a5, 7(a2) +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb s0, 1(a2) +; RV64I-NEXT: sb t6, 2(a2) +; RV64I-NEXT: sb t5, 3(a2) ; RV64I-NEXT: srli a4, a3, 56 ; RV64I-NEXT: srli a5, a3, 48 ; RV64I-NEXT: srli a6, a3, 40 @@ -1903,17 +1989,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a1, 9(a2) ; RV64I-NEXT: sb a5, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 144 +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; 
RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_32bytes: @@ -1938,55 +2026,67 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a7, 3(a0) ; RV32I-NEXT: lbu a5, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t3, 6(a0) -; RV32I-NEXT: lbu t6, 7(a0) -; RV32I-NEXT: lbu s2, 8(a0) -; RV32I-NEXT: lbu s3, 9(a0) -; RV32I-NEXT: lbu s4, 10(a0) -; RV32I-NEXT: lbu s5, 11(a0) -; RV32I-NEXT: lbu s7, 12(a0) -; RV32I-NEXT: lbu s8, 13(a0) -; RV32I-NEXT: lbu s9, 14(a0) -; RV32I-NEXT: lbu s10, 15(a0) -; RV32I-NEXT: lbu s11, 16(a0) -; RV32I-NEXT: lbu ra, 17(a0) -; RV32I-NEXT: lbu t4, 18(a0) -; RV32I-NEXT: lbu s0, 19(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s2, 13(a0) +; RV32I-NEXT: lbu s4, 14(a0) +; RV32I-NEXT: lbu s5, 15(a0) +; RV32I-NEXT: lbu s6, 16(a0) +; RV32I-NEXT: lbu s7, 17(a0) +; RV32I-NEXT: lbu s8, 18(a0) +; RV32I-NEXT: lbu s9, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 ; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: or a4, a7, a6 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: lbu t5, 22(a0) -; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu ra, 22(a0) +; RV32I-NEXT: lbu a3, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: lbu s1, 24(a0) +; RV32I-NEXT: lbu s3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) +; RV32I-NEXT: slli s2, s2, 8 ; RV32I-NEXT: slli s4, s4, 16 ; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t6, t3 -; RV32I-NEXT: or a7, s3, s2 -; RV32I-NEXT: or t0, s5, s4 -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu s5, 25(a0) -; RV32I-NEXT: lbu s6, 26(a0) -; RV32I-NEXT: lbu t6, 27(a0) -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: slli s9, s9, 16 -; RV32I-NEXT: slli s10, s10, 24 -; RV32I-NEXT: slli ra, ra, 8 -; RV32I-NEXT: or s7, s8, s7 -; RV32I-NEXT: or s2, s10, s9 -; RV32I-NEXT: or s3, ra, s11 -; RV32I-NEXT: lbu s4, 28(a0) -; RV32I-NEXT: lbu s8, 29(a0) -; RV32I-NEXT: lbu s9, 30(a0) -; RV32I-NEXT: lbu s10, 31(a0) -; RV32I-NEXT: lbu a0, 0(a1) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: or t3, s7, s6 +; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: lbu s4, 29(a0) +; RV32I-NEXT: lbu s5, 30(a0) +; RV32I-NEXT: lbu s6, 31(a0) +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: slli ra, ra, 16 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a0, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: or s2, a3, ra +; RV32I-NEXT: lbu a3, 0(a1) +; RV32I-NEXT: lbu s7, 1(a1) +; RV32I-NEXT: lbu s8, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) ; RV32I-NEXT: sw zero, 32(sp) @@ 
-1995,89 +2095,88 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw zero, 12(sp) ; RV32I-NEXT: sw zero, 16(sp) ; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: slli s3, s3, 8 +; RV32I-NEXT: or s1, s3, s1 +; RV32I-NEXT: addi s3, sp, 40 ; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or t4, s0, t4 -; RV32I-NEXT: addi s0, sp, 40 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: slli s8, s8, 8 -; RV32I-NEXT: slli s9, s9, 16 -; RV32I-NEXT: slli s10, s10, 24 -; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli s6, s6, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s4, t6 +; RV32I-NEXT: or t6, s6, s5 +; RV32I-NEXT: or a3, s7, a3 +; RV32I-NEXT: or a1, a1, s8 +; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, s4 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or t0, a0, t3 +; RV32I-NEXT: or t1, s2, s0 +; RV32I-NEXT: or t2, t4, s1 +; RV32I-NEXT: or t3, t6, t5 +; RV32I-NEXT: or a0, a1, a3 +; RV32I-NEXT: sw t0, 56(sp) +; RV32I-NEXT: sw t1, 60(sp) +; RV32I-NEXT: sw t2, 64(sp) +; RV32I-NEXT: sw t3, 68(sp) +; RV32I-NEXT: sw a4, 40(sp) +; RV32I-NEXT: sw a5, 44(sp) +; RV32I-NEXT: sw a6, 48(sp) +; RV32I-NEXT: sw a7, 52(sp) ; RV32I-NEXT: srli a1, a0, 3 -; RV32I-NEXT: or t2, s1, t5 -; RV32I-NEXT: andi t5, a0, 31 -; RV32I-NEXT: or t3, s5, t3 -; RV32I-NEXT: or t6, t6, s6 -; RV32I-NEXT: or s1, s8, s4 -; RV32I-NEXT: or s4, s10, s9 -; RV32I-NEXT: andi s5, a1, 28 -; RV32I-NEXT: xori a1, t5, 31 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, s2, s7 -; RV32I-NEXT: or a7, t4, s3 -; RV32I-NEXT: or t0, t2, t1 -; RV32I-NEXT: or t1, t6, t3 -; RV32I-NEXT: or t2, s4, s1 -; RV32I-NEXT: sub t3, s0, s5 -; RV32I-NEXT: sw a7, 56(sp) -; RV32I-NEXT: sw t0, 60(sp) -; RV32I-NEXT: sw t1, 64(sp) -; RV32I-NEXT: sw t2, 68(sp) -; RV32I-NEXT: sw a3, 40(sp) -; RV32I-NEXT: sw a4, 44(sp) -; RV32I-NEXT: sw a5, 48(sp) -; RV32I-NEXT: sw a6, 52(sp) -; RV32I-NEXT: lw a3, 0(t3) -; RV32I-NEXT: lw a4, 4(t3) -; RV32I-NEXT: lw a5, 8(t3) -; RV32I-NEXT: lw a6, 12(t3) -; RV32I-NEXT: lw a7, 16(t3) -; RV32I-NEXT: lw t0, 20(t3) -; RV32I-NEXT: lw t1, 24(t3) -; RV32I-NEXT: lw t2, 28(t3) -; RV32I-NEXT: sll t3, a4, a0 -; RV32I-NEXT: srli t4, a3, 1 -; RV32I-NEXT: sll t5, a6, a0 -; RV32I-NEXT: srli t6, a5, 1 -; RV32I-NEXT: sll a5, a5, a0 -; RV32I-NEXT: srli a4, a4, 1 -; RV32I-NEXT: sll s0, t0, a0 -; RV32I-NEXT: srli s1, a7, 1 -; RV32I-NEXT: sll a7, a7, a0 -; RV32I-NEXT: srli a6, a6, 1 +; RV32I-NEXT: andi a3, a0, 31 +; RV32I-NEXT: andi a4, a1, 28 +; RV32I-NEXT: xori a1, a3, 31 +; RV32I-NEXT: sub a3, s3, a4 +; RV32I-NEXT: lw a4, 0(a3) +; RV32I-NEXT: lw a5, 4(a3) +; RV32I-NEXT: lw a6, 8(a3) +; RV32I-NEXT: lw a7, 12(a3) +; RV32I-NEXT: lw t0, 16(a3) +; RV32I-NEXT: lw t1, 20(a3) +; RV32I-NEXT: lw t2, 24(a3) +; RV32I-NEXT: lw a3, 28(a3) +; RV32I-NEXT: sll t3, a5, a0 +; RV32I-NEXT: srli t4, a4, 1 +; RV32I-NEXT: sll t5, a7, a0 +; RV32I-NEXT: srli t6, a6, 1 +; RV32I-NEXT: sll a6, a6, a0 +; RV32I-NEXT: srli a5, a5, 1 +; RV32I-NEXT: sll s0, t1, a0 +; RV32I-NEXT: srli s1, t0, 1 +; RV32I-NEXT: sll t0, t0, a0 +; RV32I-NEXT: srli a7, a7, 1 
+; RV32I-NEXT: sll s2, a3, a0 +; RV32I-NEXT: srli a3, t2, 1 ; RV32I-NEXT: sll t2, t2, a0 -; RV32I-NEXT: srli s2, t1, 1 -; RV32I-NEXT: sll t1, t1, a0 -; RV32I-NEXT: srli t0, t0, 1 -; RV32I-NEXT: sll s3, a3, a0 +; RV32I-NEXT: srli t1, t1, 1 +; RV32I-NEXT: sll s3, a4, a0 ; RV32I-NEXT: srl a0, t4, a1 -; RV32I-NEXT: srl a3, t6, a1 -; RV32I-NEXT: srl a4, a4, a1 +; RV32I-NEXT: srl a4, t6, a1 +; RV32I-NEXT: srl a5, a5, a1 ; RV32I-NEXT: srl t4, s1, a1 -; RV32I-NEXT: srl a6, a6, a1 -; RV32I-NEXT: srl t6, s2, a1 -; RV32I-NEXT: srl t0, t0, a1 +; RV32I-NEXT: srl a7, a7, a1 +; RV32I-NEXT: srl t6, a3, a1 +; RV32I-NEXT: srl t1, t1, a1 ; RV32I-NEXT: srli s1, s3, 24 -; RV32I-NEXT: srli s2, s3, 16 -; RV32I-NEXT: srli s4, s3, 8 +; RV32I-NEXT: srli s4, s3, 16 +; RV32I-NEXT: srli s5, s3, 8 ; RV32I-NEXT: or a0, t3, a0 -; RV32I-NEXT: or a1, t5, a3 -; RV32I-NEXT: or a3, a5, a4 +; RV32I-NEXT: or a1, t5, a4 +; RV32I-NEXT: or a3, a6, a5 ; RV32I-NEXT: or a4, s0, t4 -; RV32I-NEXT: or a5, a7, a6 -; RV32I-NEXT: or a6, t2, t6 -; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, s2, t6 +; RV32I-NEXT: or a7, t2, t1 ; RV32I-NEXT: sb s3, 0(a2) -; RV32I-NEXT: sb s4, 1(a2) -; RV32I-NEXT: sb s2, 2(a2) +; RV32I-NEXT: sb s5, 1(a2) +; RV32I-NEXT: sb s4, 2(a2) ; RV32I-NEXT: sb s1, 3(a2) ; RV32I-NEXT: srli t0, a7, 24 ; RV32I-NEXT: srli t1, a7, 16 @@ -2152,17 +2251,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -144 -; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -160 +; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -2179,123 +2280,144 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s1, 13(a0) ; RV64I-NEXT: lbu s2, 14(a0) ; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: lbu s4, 16(a0) ; RV64I-NEXT: lbu s5, 17(a0) ; RV64I-NEXT: lbu s6, 18(a0) ; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, 
a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu s9, 21(a0) +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: lbu s11, 23(a0) ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t5, t5, 16 ; RV64I-NEXT: slli t6, t6, 24 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: lbu t5, 20(a0) -; RV64I-NEXT: lbu t6, 21(a0) -; RV64I-NEXT: lbu s8, 22(a0) -; RV64I-NEXT: lbu s9, 23(a0) ; RV64I-NEXT: slli s1, s1, 8 ; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: lbu t6, 24(a0) +; RV64I-NEXT: lbu s0, 25(a0) +; RV64I-NEXT: lbu s1, 26(a0) +; RV64I-NEXT: lbu s2, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s6, s6, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or t1, s1, s0 -; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: slli s9, s9, 8 ; RV64I-NEXT: or t3, s5, s4 ; RV64I-NEXT: or t4, s7, s6 -; RV64I-NEXT: lbu s0, 24(a0) -; RV64I-NEXT: lbu s1, 25(a0) -; RV64I-NEXT: lbu s2, 26(a0) -; RV64I-NEXT: lbu s3, 27(a0) -; RV64I-NEXT: slli t6, t6, 8 -; RV64I-NEXT: slli s8, s8, 16 -; RV64I-NEXT: slli s9, s9, 24 -; RV64I-NEXT: slli s1, s1, 8 -; RV64I-NEXT: or t5, t6, t5 -; RV64I-NEXT: or t6, s9, s8 -; RV64I-NEXT: or s0, s1, s0 -; RV64I-NEXT: lbu s1, 28(a0) +; RV64I-NEXT: or t5, s9, s8 +; RV64I-NEXT: lbu s3, 28(a0) ; RV64I-NEXT: lbu s4, 29(a0) ; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu s6, 31(a0) -; RV64I-NEXT: lbu a0, 0(a1) -; RV64I-NEXT: slli s2, s2, 16 -; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: or a1, s3, s2 -; RV64I-NEXT: mv s2, sp +; RV64I-NEXT: slli s10, s10, 16 +; RV64I-NEXT: slli s11, s11, 24 +; RV64I-NEXT: slli s0, s0, 8 +; RV64I-NEXT: slli s1, s1, 16 +; RV64I-NEXT: slli s2, s2, 24 ; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or a0, s11, s10 +; RV64I-NEXT: or t6, s0, t6 +; RV64I-NEXT: or s0, s2, s1 +; RV64I-NEXT: or s1, s4, s3 +; RV64I-NEXT: lbu s2, 0(a1) +; RV64I-NEXT: lbu s3, 1(a1) +; RV64I-NEXT: lbu s4, 2(a1) +; RV64I-NEXT: lbu s7, 3(a1) ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: or s1, s4, s1 -; RV64I-NEXT: srli s3, a0, 3 -; RV64I-NEXT: or s4, s6, s5 -; RV64I-NEXT: andi s5, a0, 63 -; RV64I-NEXT: andi s3, s3, 24 -; RV64I-NEXT: xori s5, s5, 63 +; RV64I-NEXT: slli s3, s3, 8 +; RV64I-NEXT: slli s4, s4, 16 +; RV64I-NEXT: slli s7, s7, 24 +; RV64I-NEXT: or s5, s6, s5 +; RV64I-NEXT: or s2, s3, s2 +; RV64I-NEXT: or s3, s7, s4 +; RV64I-NEXT: lbu s4, 5(a1) +; RV64I-NEXT: lbu s6, 4(a1) +; RV64I-NEXT: lbu s7, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: or s4, s4, s6 +; RV64I-NEXT: slli s7, s7, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, s7 +; RV64I-NEXT: mv s6, sp ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a5, t0, a7 ; RV64I-NEXT: or a6, t2, t1 ; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: or a1, a1, s0 -; RV64I-NEXT: or t1, s4, s1 -; RV64I-NEXT: add s2, s2, s3 +; RV64I-NEXT: or a0, a0, t5 +; RV64I-NEXT: or t0, s0, t6 +; RV64I-NEXT: or t1, s5, s1 +; RV64I-NEXT: or t2, s3, s2 +; RV64I-NEXT: or a1, a1, s4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: slli t0, t0, 32 -; RV64I-NEXT: slli t2, t1, 32 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: slli t3, t1, 32 +; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: sraiw t1, t1, 31 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, 
a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a1, t2, a1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: or a5, t3, t0 +; RV64I-NEXT: or a1, a1, t2 ; RV64I-NEXT: sd t1, 32(sp) ; RV64I-NEXT: sd t1, 40(sp) ; RV64I-NEXT: sd t1, 48(sp) ; RV64I-NEXT: sd t1, 56(sp) ; RV64I-NEXT: sd a3, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a5, 16(sp) -; RV64I-NEXT: sd a1, 24(sp) -; RV64I-NEXT: ld a1, 8(s2) -; RV64I-NEXT: ld a3, 16(s2) -; RV64I-NEXT: ld a4, 0(s2) -; RV64I-NEXT: ld a5, 24(s2) -; RV64I-NEXT: srl a6, a1, a0 -; RV64I-NEXT: slli a7, a3, 1 -; RV64I-NEXT: srl a4, a4, a0 -; RV64I-NEXT: slli a1, a1, 1 -; RV64I-NEXT: srl a3, a3, a0 +; RV64I-NEXT: sd a0, 16(sp) +; RV64I-NEXT: sd a5, 24(sp) +; RV64I-NEXT: srli a0, a1, 3 +; RV64I-NEXT: andi a3, a1, 63 +; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: xori a3, a3, 63 +; RV64I-NEXT: add a0, s6, a0 +; RV64I-NEXT: ld a4, 8(a0) +; RV64I-NEXT: ld a5, 16(a0) +; RV64I-NEXT: ld a6, 0(a0) +; RV64I-NEXT: ld a0, 24(a0) +; RV64I-NEXT: srl a7, a4, a1 ; RV64I-NEXT: slli t0, a5, 1 -; RV64I-NEXT: sra a5, a5, a0 -; RV64I-NEXT: sll a0, a7, s5 -; RV64I-NEXT: sll a1, a1, s5 -; RV64I-NEXT: sll a7, t0, s5 -; RV64I-NEXT: srli t0, a5, 56 -; RV64I-NEXT: srli t1, a5, 48 -; RV64I-NEXT: srli t2, a5, 40 -; RV64I-NEXT: srli t3, a5, 32 -; RV64I-NEXT: srli t4, a5, 24 -; RV64I-NEXT: srli t5, a5, 16 -; RV64I-NEXT: srli t6, a5, 8 -; RV64I-NEXT: or a0, a6, a0 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: or a3, a3, a7 +; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: srl a5, a5, a1 +; RV64I-NEXT: slli t1, a0, 1 +; RV64I-NEXT: sra t2, a0, a1 +; RV64I-NEXT: sll a0, t0, a3 +; RV64I-NEXT: sll a1, a4, a3 +; RV64I-NEXT: sll a3, t1, a3 +; RV64I-NEXT: srli a4, t2, 56 +; RV64I-NEXT: srli t0, t2, 48 +; RV64I-NEXT: srli t1, t2, 40 +; RV64I-NEXT: srli t3, t2, 32 +; RV64I-NEXT: srli t4, t2, 24 +; RV64I-NEXT: srli t5, t2, 16 +; RV64I-NEXT: srli t6, t2, 8 +; RV64I-NEXT: or a0, a7, a0 +; RV64I-NEXT: or a1, a6, a1 +; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: sb t3, 28(a2) -; RV64I-NEXT: sb t2, 29(a2) -; RV64I-NEXT: sb t1, 30(a2) -; RV64I-NEXT: sb t0, 31(a2) -; RV64I-NEXT: sb a5, 24(a2) +; RV64I-NEXT: sb t1, 29(a2) +; RV64I-NEXT: sb t0, 30(a2) +; RV64I-NEXT: sb a4, 31(a2) +; RV64I-NEXT: sb t2, 24(a2) ; RV64I-NEXT: sb t6, 25(a2) ; RV64I-NEXT: sb t5, 26(a2) ; RV64I-NEXT: sb t4, 27(a2) @@ -2316,45 +2438,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli s3, a0, 56 ; RV64I-NEXT: srli s4, a0, 48 ; RV64I-NEXT: srli s5, a0, 40 +; RV64I-NEXT: srli s6, a0, 32 ; RV64I-NEXT: sb a7, 20(a2) ; RV64I-NEXT: sb a6, 21(a2) ; RV64I-NEXT: sb a5, 22(a2) ; RV64I-NEXT: sb a4, 23(a2) -; RV64I-NEXT: srli a4, a0, 32 +; RV64I-NEXT: srli a4, a0, 24 ; RV64I-NEXT: sb a3, 16(a2) ; RV64I-NEXT: sb t2, 17(a2) ; RV64I-NEXT: sb t1, 18(a2) ; RV64I-NEXT: sb t0, 19(a2) -; RV64I-NEXT: srli a3, a0, 24 +; RV64I-NEXT: srli a3, a0, 16 ; RV64I-NEXT: sb t6, 4(a2) ; RV64I-NEXT: sb t5, 5(a2) ; RV64I-NEXT: sb t4, 6(a2) ; RV64I-NEXT: sb t3, 7(a2) -; RV64I-NEXT: srli a5, a0, 16 +; RV64I-NEXT: srli a5, a0, 8 ; RV64I-NEXT: sb a1, 0(a2) ; RV64I-NEXT: sb s2, 1(a2) ; RV64I-NEXT: sb s1, 2(a2) ; RV64I-NEXT: sb s0, 3(a2) -; RV64I-NEXT: srli a1, a0, 8 -; RV64I-NEXT: sb a4, 12(a2) +; RV64I-NEXT: sb s6, 12(a2) ; RV64I-NEXT: sb s5, 13(a2) ; RV64I-NEXT: sb s4, 14(a2) ; RV64I-NEXT: sb s3, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb a5, 10(a2) -; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 128(sp) # 8-byte 
Folded Reload -; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 144 +; RV64I-NEXT: sb a5, 9(a2) +; RV64I-NEXT: sb a3, 10(a2) +; RV64I-NEXT: sb a4, 11(a2) +; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 160 ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_32bytes: @@ -2379,148 +2503,159 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a7, 3(a0) ; RV32I-NEXT: lbu a5, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t3, 6(a0) -; RV32I-NEXT: lbu t4, 7(a0) -; RV32I-NEXT: lbu t6, 8(a0) -; RV32I-NEXT: lbu s0, 9(a0) -; RV32I-NEXT: lbu s4, 10(a0) -; RV32I-NEXT: lbu s5, 11(a0) -; RV32I-NEXT: lbu s6, 12(a0) -; RV32I-NEXT: lbu s7, 13(a0) -; RV32I-NEXT: lbu s8, 14(a0) -; RV32I-NEXT: lbu s9, 15(a0) -; RV32I-NEXT: lbu s10, 16(a0) -; RV32I-NEXT: lbu s11, 17(a0) -; RV32I-NEXT: lbu s2, 18(a0) -; RV32I-NEXT: lbu s3, 19(a0) +; RV32I-NEXT: lbu t1, 6(a0) +; RV32I-NEXT: lbu t2, 7(a0) +; RV32I-NEXT: lbu t3, 8(a0) +; RV32I-NEXT: lbu t4, 9(a0) +; RV32I-NEXT: lbu t5, 10(a0) +; RV32I-NEXT: lbu t6, 11(a0) +; RV32I-NEXT: lbu s0, 12(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: lbu s3, 15(a0) +; RV32I-NEXT: lbu s4, 16(a0) +; RV32I-NEXT: lbu s5, 17(a0) +; RV32I-NEXT: lbu s6, 18(a0) +; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 ; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: or a4, a7, a6 -; RV32I-NEXT: lbu t1, 20(a0) -; RV32I-NEXT: lbu t2, 21(a0) -; RV32I-NEXT: lbu t5, 22(a0) -; RV32I-NEXT: lbu s1, 23(a0) +; RV32I-NEXT: lbu s8, 20(a0) +; RV32I-NEXT: lbu s9, 21(a0) +; RV32I-NEXT: lbu s10, 22(a0) +; RV32I-NEXT: lbu s11, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s4, s4, 16 -; RV32I-NEXT: slli s5, s5, 24 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 ; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t4, t3 -; RV32I-NEXT: or a7, s0, t6 -; RV32I-NEXT: or t0, s5, s4 -; RV32I-NEXT: lbu t3, 24(a0) -; RV32I-NEXT: lbu s4, 25(a0) -; RV32I-NEXT: lbu s5, 26(a0) -; RV32I-NEXT: lbu ra, 27(a0) -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: or t4, s7, s6 -; RV32I-NEXT: or t6, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: lbu s6, 28(a0) -; RV32I-NEXT: 
lbu s7, 29(a0) -; RV32I-NEXT: lbu s8, 30(a0) -; RV32I-NEXT: lbu s9, 31(a0) -; RV32I-NEXT: lbu a0, 0(a1) +; RV32I-NEXT: or a6, t2, t1 +; RV32I-NEXT: or a7, t4, t3 +; RV32I-NEXT: or t0, t6, t5 +; RV32I-NEXT: lbu ra, 24(a0) +; RV32I-NEXT: lbu a3, 25(a0) +; RV32I-NEXT: lbu t4, 26(a0) +; RV32I-NEXT: lbu t5, 27(a0) +; RV32I-NEXT: slli s1, s1, 8 ; RV32I-NEXT: slli s2, s2, 16 ; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: or s2, s3, s2 -; RV32I-NEXT: addi s3, sp, 8 -; RV32I-NEXT: slli t2, t2, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 16 -; RV32I-NEXT: slli ra, ra, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: or t1, t2, t1 -; RV32I-NEXT: srli a1, a0, 3 +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: or t1, s1, s0 +; RV32I-NEXT: or t2, s3, s2 +; RV32I-NEXT: or t3, s5, s4 +; RV32I-NEXT: lbu t6, 28(a0) +; RV32I-NEXT: lbu s0, 29(a0) +; RV32I-NEXT: lbu s1, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli s7, s7, 24 +; RV32I-NEXT: slli s9, s9, 8 +; RV32I-NEXT: slli s10, s10, 16 +; RV32I-NEXT: slli s11, s11, 24 +; RV32I-NEXT: or s2, s7, s6 +; RV32I-NEXT: or s3, s9, s8 +; RV32I-NEXT: or s4, s11, s10 +; RV32I-NEXT: lbu s5, 0(a1) +; RV32I-NEXT: lbu s6, 1(a1) +; RV32I-NEXT: lbu s7, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, ra +; RV32I-NEXT: addi s8, sp, 8 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s1, s1, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: slli s6, s6, 8 +; RV32I-NEXT: slli s7, s7, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or t4, t5, t4 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or s1, a0, s1 +; RV32I-NEXT: or t6, s6, s5 +; RV32I-NEXT: or a1, a1, s7 +; RV32I-NEXT: srai s0, a0, 31 +; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: or a4, a4, a0 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or t0, s2, t3 +; RV32I-NEXT: or t1, s4, s3 +; RV32I-NEXT: or a3, t4, a3 ; RV32I-NEXT: or t2, s1, t5 -; RV32I-NEXT: andi t5, a0, 31 -; RV32I-NEXT: or t3, s4, t3 -; RV32I-NEXT: or s1, ra, s5 -; RV32I-NEXT: or s4, s7, s6 -; RV32I-NEXT: or s5, s9, s8 -; RV32I-NEXT: srai s6, s9, 31 -; RV32I-NEXT: andi s7, a1, 28 -; RV32I-NEXT: xori a1, t5, 31 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, t6, t4 -; RV32I-NEXT: or a7, s2, s0 -; RV32I-NEXT: or t0, t2, t1 -; RV32I-NEXT: or t1, s1, t3 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: sw s6, 56(sp) -; RV32I-NEXT: sw s6, 60(sp) -; RV32I-NEXT: sw s6, 64(sp) -; RV32I-NEXT: sw s6, 68(sp) -; RV32I-NEXT: sw s6, 40(sp) -; RV32I-NEXT: sw s6, 44(sp) -; RV32I-NEXT: sw s6, 48(sp) -; RV32I-NEXT: sw s6, 52(sp) -; RV32I-NEXT: add s3, s3, s7 -; RV32I-NEXT: sw a7, 24(sp) -; RV32I-NEXT: sw t0, 28(sp) -; RV32I-NEXT: sw t1, 32(sp) +; RV32I-NEXT: or a0, a1, t6 +; RV32I-NEXT: sw s0, 56(sp) +; RV32I-NEXT: sw s0, 60(sp) +; RV32I-NEXT: sw s0, 64(sp) +; RV32I-NEXT: sw s0, 68(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s0, 44(sp) +; RV32I-NEXT: sw s0, 48(sp) +; RV32I-NEXT: sw s0, 52(sp) +; RV32I-NEXT: sw t0, 24(sp) +; RV32I-NEXT: sw t1, 28(sp) +; RV32I-NEXT: sw a3, 32(sp) ; RV32I-NEXT: sw t2, 36(sp) -; RV32I-NEXT: sw a3, 8(sp) -; RV32I-NEXT: sw a4, 12(sp) -; RV32I-NEXT: sw a5, 16(sp) -; RV32I-NEXT: sw a6, 20(sp) -; RV32I-NEXT: lw a3, 0(s3) -; RV32I-NEXT: lw a4, 
4(s3) -; RV32I-NEXT: lw a5, 8(s3) -; RV32I-NEXT: lw a6, 12(s3) -; RV32I-NEXT: lw a7, 16(s3) -; RV32I-NEXT: lw t0, 20(s3) -; RV32I-NEXT: lw t1, 24(s3) -; RV32I-NEXT: lw t2, 28(s3) -; RV32I-NEXT: srl t3, a4, a0 -; RV32I-NEXT: slli t4, a5, 1 +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 12(sp) +; RV32I-NEXT: sw a6, 16(sp) +; RV32I-NEXT: sw a7, 20(sp) +; RV32I-NEXT: srli a1, a0, 3 +; RV32I-NEXT: andi a3, a0, 31 +; RV32I-NEXT: andi a4, a1, 28 +; RV32I-NEXT: xori a1, a3, 31 +; RV32I-NEXT: add a4, s8, a4 +; RV32I-NEXT: lw a3, 0(a4) +; RV32I-NEXT: lw a5, 4(a4) +; RV32I-NEXT: lw a6, 8(a4) +; RV32I-NEXT: lw a7, 12(a4) +; RV32I-NEXT: lw t0, 16(a4) +; RV32I-NEXT: lw t1, 20(a4) +; RV32I-NEXT: lw t2, 24(a4) +; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: srl t3, a5, a0 +; RV32I-NEXT: slli t4, a6, 1 ; RV32I-NEXT: srl a3, a3, a0 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: srl t5, a6, a0 -; RV32I-NEXT: slli t6, a7, 1 -; RV32I-NEXT: srl a5, a5, a0 -; RV32I-NEXT: slli a6, a6, 1 -; RV32I-NEXT: srl s0, t0, a0 -; RV32I-NEXT: slli s1, t1, 1 -; RV32I-NEXT: srl a7, a7, a0 -; RV32I-NEXT: slli t0, t0, 1 -; RV32I-NEXT: srl t1, t1, a0 -; RV32I-NEXT: slli s2, t2, 1 -; RV32I-NEXT: sra t2, t2, a0 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: srl t5, a7, a0 +; RV32I-NEXT: slli t6, t0, 1 +; RV32I-NEXT: srl a6, a6, a0 +; RV32I-NEXT: slli a7, a7, 1 +; RV32I-NEXT: srl s0, t1, a0 +; RV32I-NEXT: slli s1, t2, 1 +; RV32I-NEXT: srl t0, t0, a0 +; RV32I-NEXT: slli t1, t1, 1 +; RV32I-NEXT: srl t2, t2, a0 +; RV32I-NEXT: slli s2, a4, 1 +; RV32I-NEXT: sra s3, a4, a0 ; RV32I-NEXT: sll a0, t4, a1 -; RV32I-NEXT: sll a4, a4, a1 -; RV32I-NEXT: sll t4, t6, a1 -; RV32I-NEXT: sll a6, a6, a1 -; RV32I-NEXT: sll t6, s1, a1 -; RV32I-NEXT: sll t0, t0, a1 -; RV32I-NEXT: sll s1, s2, a1 -; RV32I-NEXT: srli s2, t2, 24 -; RV32I-NEXT: srli s3, t2, 16 -; RV32I-NEXT: srli s4, t2, 8 +; RV32I-NEXT: sll a4, a5, a1 +; RV32I-NEXT: sll a5, t6, a1 +; RV32I-NEXT: sll a7, a7, a1 +; RV32I-NEXT: sll t4, s1, a1 +; RV32I-NEXT: sll t1, t1, a1 +; RV32I-NEXT: sll t6, s2, a1 +; RV32I-NEXT: srli s1, s3, 24 +; RV32I-NEXT: srli s2, s3, 16 +; RV32I-NEXT: srli s4, s3, 8 ; RV32I-NEXT: or a0, t3, a0 ; RV32I-NEXT: or a1, a3, a4 -; RV32I-NEXT: or a3, t5, t4 -; RV32I-NEXT: or a4, a5, a6 -; RV32I-NEXT: or a5, s0, t6 -; RV32I-NEXT: or a6, a7, t0 -; RV32I-NEXT: or a7, t1, s1 -; RV32I-NEXT: sb t2, 28(a2) +; RV32I-NEXT: or a3, t5, a5 +; RV32I-NEXT: or a4, a6, a7 +; RV32I-NEXT: or a5, s0, t4 +; RV32I-NEXT: or a6, t0, t1 +; RV32I-NEXT: or a7, t2, t6 +; RV32I-NEXT: sb s3, 28(a2) ; RV32I-NEXT: sb s4, 29(a2) -; RV32I-NEXT: sb s3, 30(a2) -; RV32I-NEXT: sb s2, 31(a2) +; RV32I-NEXT: sb s2, 30(a2) +; RV32I-NEXT: sb s1, 31(a2) ; RV32I-NEXT: srli t0, a7, 24 ; RV32I-NEXT: srli t1, a7, 16 ; RV32I-NEXT: srli t2, a7, 8 diff --git a/llvm/test/CodeGen/RISCV/xqciac.ll b/llvm/test/CodeGen/RISCV/xqciac.ll index a3b4e78..4c77b39 100644 --- a/llvm/test/CodeGen/RISCV/xqciac.ll +++ b/llvm/test/CodeGen/RISCV/xqciac.ll @@ -231,12 +231,12 @@ define dso_local i32 @pow2(i32 %a, i32 %b) local_unnamed_addr #0 { ; ; RV32IMXQCIAC-LABEL: pow2: ; RV32IMXQCIAC: # %bb.0: # %entry -; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: pow2: ; RV32IZBAMXQCIAC: # %bb.0: # %entry -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5 ; RV32IZBAMXQCIAC-NEXT: ret entry: %mul = mul nsw i32 %b, 32 @@ -276,12 +276,12 @@ define dso_local i32 @shladd(i32 %a, i32 %b) local_unnamed_addr #0 { ; ; 
RV32IMXQCIAC-LABEL: shladd: ; RV32IMXQCIAC: # %bb.0: # %entry -; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 31 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 31 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladd: ; RV32IZBAMXQCIAC: # %bb.0: # %entry -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 31 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 31 ; RV32IZBAMXQCIAC-NEXT: ret entry: %shl = shl nsw i32 %b, 31 @@ -305,9 +305,9 @@ define dso_local i64 @shladd64(i64 %a, i64 %b) local_unnamed_addr #0 { ; RV32IMXQCIAC-LABEL: shladd64: ; RV32IMXQCIAC: # %bb.0: # %entry ; RV32IMXQCIAC-NEXT: srli a4, a2, 1 -; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a2, 31 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a2, a0, 31 ; RV32IMXQCIAC-NEXT: slli a2, a2, 31 -; RV32IMXQCIAC-NEXT: qc.shladd a3, a4, a3, 31 +; RV32IMXQCIAC-NEXT: qc.shladd a3, a3, a4, 31 ; RV32IMXQCIAC-NEXT: sltu a2, a0, a2 ; RV32IMXQCIAC-NEXT: add a1, a1, a3 ; RV32IMXQCIAC-NEXT: add a1, a1, a2 @@ -316,9 +316,9 @@ define dso_local i64 @shladd64(i64 %a, i64 %b) local_unnamed_addr #0 { ; RV32IZBAMXQCIAC-LABEL: shladd64: ; RV32IZBAMXQCIAC: # %bb.0: # %entry ; RV32IZBAMXQCIAC-NEXT: srli a4, a2, 1 -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a2, 31 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a2, a0, 31 ; RV32IZBAMXQCIAC-NEXT: slli a2, a2, 31 -; RV32IZBAMXQCIAC-NEXT: qc.shladd a3, a4, a3, 31 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a3, a3, a4, 31 ; RV32IZBAMXQCIAC-NEXT: sltu a2, a0, a2 ; RV32IZBAMXQCIAC-NEXT: add a1, a1, a3 ; RV32IZBAMXQCIAC-NEXT: add a1, a1, a2 @@ -338,12 +338,12 @@ define dso_local i32 @shladd_ordisjoint(i32 %a, i32 %b) local_unnamed_addr #0 { ; ; RV32IMXQCIAC-LABEL: shladd_ordisjoint: ; RV32IMXQCIAC: # %bb.0: # %entry -; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 22 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 22 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladd_ordisjoint: ; RV32IZBAMXQCIAC: # %bb.0: # %entry -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 22 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 22 ; RV32IZBAMXQCIAC-NEXT: ret entry: %shl = shl nsw i32 %b, 22 @@ -361,13 +361,13 @@ define dso_local i32 @shladdc1c2(i32 %a, i32 %b) local_unnamed_addr #0 { ; ; RV32IMXQCIAC-LABEL: shladdc1c2: ; RV32IMXQCIAC: # %bb.0: # %entry -; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5 ; RV32IMXQCIAC-NEXT: slli a0, a0, 26 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladdc1c2: ; RV32IZBAMXQCIAC: # %bb.0: # %entry -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 5 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 5 ; RV32IZBAMXQCIAC-NEXT: slli a0, a0, 26 ; RV32IZBAMXQCIAC-NEXT: ret entry: @@ -388,7 +388,7 @@ define dso_local i32 @shxaddc1c2(i32 %a, i32 %b) local_unnamed_addr #0 { ; RV32IMXQCIAC-LABEL: shxaddc1c2: ; RV32IMXQCIAC: # %bb.0: # %entry ; RV32IMXQCIAC-NEXT: slli a1, a1, 28 -; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 31 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 31 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shxaddc1c2: @@ -417,18 +417,18 @@ define dso_local i64 @shladdc1c264(i64 %a, i64 %b) local_unnamed_addr #0 { ; RV32IMXQCIAC-LABEL: shladdc1c264: ; RV32IMXQCIAC: # %bb.0: # %entry ; RV32IMXQCIAC-NEXT: srli a1, a2, 12 -; RV32IMXQCIAC-NEXT: qc.shladd a1, a1, a3, 20 +; RV32IMXQCIAC-NEXT: qc.shladd a1, a3, a1, 20 ; RV32IMXQCIAC-NEXT: slli a2, a2, 20 -; RV32IMXQCIAC-NEXT: qc.shladd a1, a1, a0, 23 +; RV32IMXQCIAC-NEXT: qc.shladd a1, a0, a1, 23 ; RV32IMXQCIAC-NEXT: mv a0, a2 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladdc1c264: ; RV32IZBAMXQCIAC: # %bb.0: # %entry ; RV32IZBAMXQCIAC-NEXT: 
srli a1, a2, 12 -; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a1, a3, 20 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a3, a1, 20 ; RV32IZBAMXQCIAC-NEXT: slli a2, a2, 20 -; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a1, a0, 23 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a1, a0, a1, 23 ; RV32IZBAMXQCIAC-NEXT: mv a0, a2 ; RV32IZBAMXQCIAC-NEXT: ret entry: @@ -449,13 +449,13 @@ define dso_local i32 @shladdc1equalc2(i32 %a, i32 %b) local_unnamed_addr #0 { ; RV32IMXQCIAC-LABEL: shladdc1equalc2: ; RV32IMXQCIAC: # %bb.0: # %entry ; RV32IMXQCIAC-NEXT: slli a1, a1, 12 -; RV32IMXQCIAC-NEXT: qc.shladd a0, a1, a0, 12 +; RV32IMXQCIAC-NEXT: qc.shladd a0, a0, a1, 12 ; RV32IMXQCIAC-NEXT: ret ; ; RV32IZBAMXQCIAC-LABEL: shladdc1equalc2: ; RV32IZBAMXQCIAC: # %bb.0: # %entry ; RV32IZBAMXQCIAC-NEXT: slli a1, a1, 12 -; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a1, a0, 12 +; RV32IZBAMXQCIAC-NEXT: qc.shladd a0, a0, a1, 12 ; RV32IZBAMXQCIAC-NEXT: ret entry: %shlc1 = shl nsw i32 %a, 12 diff --git a/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll index cdaae23..5724c4f 100644 --- a/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll +++ b/llvm/test/CodeGen/RISCV/xtheadfmemidx.ll @@ -1,33 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d -mattr=+xtheadfmemidx -mattr=+m -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=RV32XTHEADMEMIDX -; RUN: llc -mtriple=riscv64 -mattr=+d -mattr=+xtheadfmemidx -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=RV64XTHEADFMEMIDX +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+d,+xtheadfmemidx \ +; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32XTHEADFMEMIDX +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+d,+xtheadfmemidx \ +; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64XTHEADFMEMIDX -define float @flrw(ptr %a, i64 %b) { -; RV32XTHEADMEMIDX-LABEL: flrw: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.flrw fa5, a0, a1, 2 -; RV32XTHEADMEMIDX-NEXT: fadd.s fa0, fa5, fa5 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADFMEMIDX-LABEL: flrw: -; RV64XTHEADFMEMIDX: # %bb.0: -; RV64XTHEADFMEMIDX-NEXT: th.flrw fa5, a0, a1, 2 -; RV64XTHEADFMEMIDX-NEXT: fadd.s fa0, fa5, fa5 -; RV64XTHEADFMEMIDX-NEXT: ret - %1 = getelementptr float, ptr %a, i64 %b +define float @flrw(ptr %a, iXLen %b) { +; CHECK-LABEL: flrw: +; CHECK: # %bb.0: +; CHECK-NEXT: th.flrw fa5, a0, a1, 2 +; CHECK-NEXT: fadd.s fa0, fa5, fa5 +; CHECK-NEXT: ret + %1 = getelementptr float, ptr %a, iXLen %b %2 = load float, ptr %1, align 4 %3 = fadd float %2, %2 ret float %3 } define float @flurw(ptr %a, i32 %b) { -; RV32XTHEADMEMIDX-LABEL: flurw: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.flrw fa5, a0, a1, 2 -; RV32XTHEADMEMIDX-NEXT: fadd.s fa0, fa5, fa5 -; RV32XTHEADMEMIDX-NEXT: ret +; RV32XTHEADFMEMIDX-LABEL: flurw: +; RV32XTHEADFMEMIDX: # %bb.0: +; RV32XTHEADFMEMIDX-NEXT: th.flrw fa5, a0, a1, 2 +; RV32XTHEADFMEMIDX-NEXT: fadd.s fa0, fa5, fa5 +; RV32XTHEADFMEMIDX-NEXT: ret ; ; RV64XTHEADFMEMIDX-LABEL: flurw: ; RV64XTHEADFMEMIDX: # %bb.0: @@ -41,30 +35,24 @@ define float @flurw(ptr %a, i32 %b) { ret float %4 } -define void @fsrw(ptr %a, i64 %b, float %c) { -; RV32XTHEADMEMIDX-LABEL: fsrw: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: fadd.s fa5, fa0, fa0 -; RV32XTHEADMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADFMEMIDX-LABEL: fsrw: -; RV64XTHEADFMEMIDX: # %bb.0: -; RV64XTHEADFMEMIDX-NEXT: fadd.s fa5, fa0, 
fa0 -; RV64XTHEADFMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2 -; RV64XTHEADFMEMIDX-NEXT: ret +define void @fsrw(ptr %a, iXLen %b, float %c) { +; CHECK-LABEL: fsrw: +; CHECK: # %bb.0: +; CHECK-NEXT: fadd.s fa5, fa0, fa0 +; CHECK-NEXT: th.fsrw fa5, a0, a1, 2 +; CHECK-NEXT: ret %1 = fadd float %c, %c - %2 = getelementptr float, ptr %a, i64 %b + %2 = getelementptr float, ptr %a, iXLen %b store float %1, ptr %2, align 4 ret void } define void @fsurw(ptr %a, i32 %b, float %c) { -; RV32XTHEADMEMIDX-LABEL: fsurw: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: fadd.s fa5, fa0, fa0 -; RV32XTHEADMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2 -; RV32XTHEADMEMIDX-NEXT: ret +; RV32XTHEADFMEMIDX-LABEL: fsurw: +; RV32XTHEADFMEMIDX: # %bb.0: +; RV32XTHEADFMEMIDX-NEXT: fadd.s fa5, fa0, fa0 +; RV32XTHEADFMEMIDX-NEXT: th.fsrw fa5, a0, a1, 2 +; RV32XTHEADFMEMIDX-NEXT: ret ; ; RV64XTHEADFMEMIDX-LABEL: fsurw: ; RV64XTHEADFMEMIDX: # %bb.0: @@ -78,30 +66,24 @@ define void @fsurw(ptr %a, i32 %b, float %c) { ret void } -define double @flrd(ptr %a, i64 %b) { -; RV32XTHEADMEMIDX-LABEL: flrd: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.flrd fa5, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: fadd.d fa0, fa5, fa5 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADFMEMIDX-LABEL: flrd: -; RV64XTHEADFMEMIDX: # %bb.0: -; RV64XTHEADFMEMIDX-NEXT: th.flrd fa5, a0, a1, 3 -; RV64XTHEADFMEMIDX-NEXT: fadd.d fa0, fa5, fa5 -; RV64XTHEADFMEMIDX-NEXT: ret - %1 = getelementptr double, ptr %a, i64 %b +define double @flrd(ptr %a, iXLen %b) { +; CHECK-LABEL: flrd: +; CHECK: # %bb.0: +; CHECK-NEXT: th.flrd fa5, a0, a1, 3 +; CHECK-NEXT: fadd.d fa0, fa5, fa5 +; CHECK-NEXT: ret + %1 = getelementptr double, ptr %a, iXLen %b %2 = load double, ptr %1, align 8 %3 = fadd double %2, %2 ret double %3 } define double @flurd(ptr %a, i32 %b) { -; RV32XTHEADMEMIDX-LABEL: flurd: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.flrd fa5, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: fadd.d fa0, fa5, fa5 -; RV32XTHEADMEMIDX-NEXT: ret +; RV32XTHEADFMEMIDX-LABEL: flurd: +; RV32XTHEADFMEMIDX: # %bb.0: +; RV32XTHEADFMEMIDX-NEXT: th.flrd fa5, a0, a1, 3 +; RV32XTHEADFMEMIDX-NEXT: fadd.d fa0, fa5, fa5 +; RV32XTHEADFMEMIDX-NEXT: ret ; ; RV64XTHEADFMEMIDX-LABEL: flurd: ; RV64XTHEADFMEMIDX: # %bb.0: @@ -115,30 +97,24 @@ define double @flurd(ptr %a, i32 %b) { ret double %4 } -define void @fsrd(ptr %a, i64 %b, double %c) { -; RV32XTHEADMEMIDX-LABEL: fsrd: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: fadd.d fa5, fa0, fa0 -; RV32XTHEADMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADFMEMIDX-LABEL: fsrd: -; RV64XTHEADFMEMIDX: # %bb.0: -; RV64XTHEADFMEMIDX-NEXT: fadd.d fa5, fa0, fa0 -; RV64XTHEADFMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3 -; RV64XTHEADFMEMIDX-NEXT: ret +define void @fsrd(ptr %a, iXLen %b, double %c) { +; CHECK-LABEL: fsrd: +; CHECK: # %bb.0: +; CHECK-NEXT: fadd.d fa5, fa0, fa0 +; CHECK-NEXT: th.fsrd fa5, a0, a1, 3 +; CHECK-NEXT: ret %1 = fadd double %c, %c - %2 = getelementptr double, ptr %a, i64 %b + %2 = getelementptr double, ptr %a, iXLen %b store double %1, ptr %2, align 8 ret void } define void @fsurd(ptr %a, i32 %b, double %c) { -; RV32XTHEADMEMIDX-LABEL: fsurd: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: fadd.d fa5, fa0, fa0 -; RV32XTHEADMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: ret +; RV32XTHEADFMEMIDX-LABEL: fsurd: +; RV32XTHEADFMEMIDX: # %bb.0: +; RV32XTHEADFMEMIDX-NEXT: fadd.d fa5, fa0, fa0 +; RV32XTHEADFMEMIDX-NEXT: th.fsrd fa5, a0, a1, 3 +; RV32XTHEADFMEMIDX-NEXT: ret ; ; 
RV64XTHEADFMEMIDX-LABEL: fsurd: ; RV64XTHEADFMEMIDX: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll index fc20fcb..a20b08a 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll @@ -1,238 +1,156 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d -mattr=+xtheadmemidx -mattr=+m -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=RV32XTHEADMEMIDX -; RUN: llc -mtriple=riscv64 -mattr=+d -mattr=+xtheadmemidx -mattr=+m -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=RV64XTHEADMEMIDX +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+d,+xtheadmemidx \ +; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32XTHEADMEMIDX +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+d,+xtheadmemidx \ +; RUN: -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64XTHEADMEMIDX define ptr @lbia(ptr %base, ptr %addr.2, i8 %a) { -; RV32XTHEADMEMIDX-LABEL: lbia: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lbia a3, (a0), -1, 0 -; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2 -; RV32XTHEADMEMIDX-NEXT: sb a2, 0(a1) -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lbia: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lbia a3, (a0), -1, 0 -; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2 -; RV64XTHEADMEMIDX-NEXT: sb a2, 0(a1) -; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i8, ptr %base, i8 0 +; CHECK-LABEL: lbia: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lbia a3, (a0), -1, 0 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: sb a2, 0(a1) +; CHECK-NEXT: ret + %addr = getelementptr i8, ptr %base, iXLen 0 %ld = load i8, ptr %addr - %addr.1 = getelementptr i8, ptr %base, i8 -1 + %addr.1 = getelementptr i8, ptr %base, iXLen -1 %res = add i8 %ld, %a store i8 %res, ptr %addr.2 ret ptr %addr.1 } define ptr @lbib(ptr %base, i8 %a) { -; RV32XTHEADMEMIDX-LABEL: lbib: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lbib a2, (a0), 1, 0 -; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1 -; RV32XTHEADMEMIDX-NEXT: sb a1, 1(a0) -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lbib: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lbib a2, (a0), 1, 0 -; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1 -; RV64XTHEADMEMIDX-NEXT: sb a1, 1(a0) -; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i8, ptr %base, i8 1 +; CHECK-LABEL: lbib: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lbib a2, (a0), 1, 0 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: sb a1, 1(a0) +; CHECK-NEXT: ret + %addr = getelementptr i8, ptr %base, iXLen 1 %ld = load i8, ptr %addr - %addr.1 = getelementptr i8, ptr %base, i8 2 + %addr.1 = getelementptr i8, ptr %base, iXLen 2 %res = add i8 %ld, %a store i8 %res, ptr %addr.1 ret ptr %addr } -define ptr @lbuia(ptr %base, ptr %addr.2, i64 %a) { -; RV32XTHEADMEMIDX-LABEL: lbuia: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lbuia a4, (a0), -1, 0 -; RV32XTHEADMEMIDX-NEXT: add a2, a4, a2 -; RV32XTHEADMEMIDX-NEXT: sltu a4, a2, a4 -; RV32XTHEADMEMIDX-NEXT: add a3, a3, a4 -; RV32XTHEADMEMIDX-NEXT: sw a2, 0(a1) -; RV32XTHEADMEMIDX-NEXT: sw a3, 4(a1) -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lbuia: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lbuia a3, (a0), -1, 0 -; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2 -; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1) -; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i8, ptr %base, i8 0 +define ptr 
@lbuia(ptr %base, ptr %addr.2, i32 %a) { +; CHECK-LABEL: lbuia: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lbuia a3, (a0), -1, 0 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: sw a2, 0(a1) +; CHECK-NEXT: ret + %addr = getelementptr i8, ptr %base, iXLen 0 %ld = load i8, ptr %addr - %zext = zext i8 %ld to i64 - %addr.1 = getelementptr i8, ptr %base, i8 -1 - %res = add i64 %zext, %a - store i64 %res, ptr %addr.2 + %zext = zext i8 %ld to i32 + %addr.1 = getelementptr i8, ptr %base, iXLen -1 + %res = add i32 %zext, %a + store i32 %res, ptr %addr.2 ret ptr %addr.1 } -define ptr @lbuib(ptr %base, i64 %a, ptr %addr.1) { -; RV32XTHEADMEMIDX-LABEL: lbuib: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lbuib a4, (a0), 1, 0 -; RV32XTHEADMEMIDX-NEXT: add a1, a4, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a4, a1, a4 -; RV32XTHEADMEMIDX-NEXT: add a2, a2, a4 -; RV32XTHEADMEMIDX-NEXT: sw a1, 0(a3) -; RV32XTHEADMEMIDX-NEXT: sw a2, 4(a3) -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lbuib: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lbuib a3, (a0), 1, 0 -; RV64XTHEADMEMIDX-NEXT: add a1, a3, a1 -; RV64XTHEADMEMIDX-NEXT: sd a1, 0(a2) -; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i8, ptr %base, i8 1 +define ptr @lbuib(ptr %base, i32 %a, ptr %addr.1) { +; CHECK-LABEL: lbuib: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lbuib a3, (a0), 1, 0 +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: sw a1, 0(a2) +; CHECK-NEXT: ret + %addr = getelementptr i8, ptr %base, iXLen 1 %ld = load i8, ptr %addr - %zext = zext i8 %ld to i64 - %res = add i64 %zext, %a - store i64 %res, ptr %addr.1 + %zext = zext i8 %ld to i32 + %res = add i32 %zext, %a + store i32 %res, ptr %addr.1 ret ptr %addr } define ptr @lhia(ptr %base, ptr %addr.2, i16 %a) { -; RV32XTHEADMEMIDX-LABEL: lhia: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lhia a3, (a0), -16, 1 -; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2 -; RV32XTHEADMEMIDX-NEXT: sh a2, 0(a1) -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lhia: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lhia a3, (a0), -16, 1 -; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2 -; RV64XTHEADMEMIDX-NEXT: sh a2, 0(a1) -; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i16, ptr %base, i16 0 +; CHECK-LABEL: lhia: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lhia a3, (a0), -16, 1 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: sh a2, 0(a1) +; CHECK-NEXT: ret + %addr = getelementptr i16, ptr %base, iXLen 0 %ld = load i16, ptr %addr - %addr.1 = getelementptr i16, ptr %base, i16 -16 + %addr.1 = getelementptr i16, ptr %base, iXLen -16 %res = add i16 %ld, %a store i16 %res, ptr %addr.2 ret ptr %addr.1 } define ptr @lhib(ptr %base, i16 %a) { -; RV32XTHEADMEMIDX-LABEL: lhib: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lhib a2, (a0), 2, 0 -; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1 -; RV32XTHEADMEMIDX-NEXT: sh a1, 2(a0) -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lhib: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lhib a2, (a0), 2, 0 -; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1 -; RV64XTHEADMEMIDX-NEXT: sh a1, 2(a0) -; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i16, ptr %base, i16 1 +; CHECK-LABEL: lhib: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lhib a2, (a0), 2, 0 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: sh a1, 2(a0) +; CHECK-NEXT: ret + %addr = getelementptr i16, ptr %base, iXLen 1 %ld = load i16, ptr %addr - %addr.1 = getelementptr i16, ptr %base, i16 2 + %addr.1 = getelementptr i16, ptr %base, iXLen 2 %res = add i16 %ld, %a store i16 %res, ptr 
%addr.1 ret ptr %addr } -define ptr @lhuia(ptr %base, ptr %addr.2, i64 %a) { -; RV32XTHEADMEMIDX-LABEL: lhuia: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lhuia a4, (a0), -16, 1 -; RV32XTHEADMEMIDX-NEXT: add a2, a4, a2 -; RV32XTHEADMEMIDX-NEXT: sltu a4, a2, a4 -; RV32XTHEADMEMIDX-NEXT: add a3, a3, a4 -; RV32XTHEADMEMIDX-NEXT: sw a2, 0(a1) -; RV32XTHEADMEMIDX-NEXT: sw a3, 4(a1) -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lhuia: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lhuia a3, (a0), -16, 1 -; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2 -; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1) -; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i16, ptr %base, i16 0 +define ptr @lhuia(ptr %base, ptr %addr.2, i32 %a) { +; CHECK-LABEL: lhuia: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lhuia a3, (a0), -16, 1 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: sw a2, 0(a1) +; CHECK-NEXT: ret + %addr = getelementptr i16, ptr %base, iXLen 0 %ld = load i16, ptr %addr - %zext = zext i16 %ld to i64 - %addr.1 = getelementptr i16, ptr %base, i16 -16 - %res = add i64 %zext, %a - store i64 %res, ptr %addr.2 + %zext = zext i16 %ld to i32 + %addr.1 = getelementptr i16, ptr %base, iXLen -16 + %res = add i32 %zext, %a + store i32 %res, ptr %addr.2 ret ptr %addr.1 } -define ptr @lhuib(ptr %base, i64 %a, ptr %addr.1) { -; RV32XTHEADMEMIDX-LABEL: lhuib: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lhuib a4, (a0), 2, 0 -; RV32XTHEADMEMIDX-NEXT: add a1, a4, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a4, a1, a4 -; RV32XTHEADMEMIDX-NEXT: add a2, a2, a4 -; RV32XTHEADMEMIDX-NEXT: sw a1, 0(a3) -; RV32XTHEADMEMIDX-NEXT: sw a2, 4(a3) -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lhuib: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lhuib a3, (a0), 2, 0 -; RV64XTHEADMEMIDX-NEXT: add a1, a3, a1 -; RV64XTHEADMEMIDX-NEXT: sd a1, 0(a2) -; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i16, ptr %base, i16 1 +define ptr @lhuib(ptr %base, i32 %a, ptr %addr.1) { +; CHECK-LABEL: lhuib: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lhuib a3, (a0), 2, 0 +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: sw a1, 0(a2) +; CHECK-NEXT: ret + %addr = getelementptr i16, ptr %base, iXLen 1 %ld = load i16, ptr %addr - %zext = zext i16 %ld to i64 - %res = add i64 %zext, %a - store i64 %res, ptr %addr.1 + %zext = zext i16 %ld to i32 + %res = add i32 %zext, %a + store i32 %res, ptr %addr.1 ret ptr %addr } define ptr @lwia(ptr %base, ptr %addr.2, i32 %a) { -; RV32XTHEADMEMIDX-LABEL: lwia: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lwia a3, (a0), -16, 2 -; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2 -; RV32XTHEADMEMIDX-NEXT: sw a2, 0(a1) -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lwia: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lwia a3, (a0), -16, 2 -; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2 -; RV64XTHEADMEMIDX-NEXT: sw a2, 0(a1) -; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i32, ptr %base, i32 0 +; CHECK-LABEL: lwia: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lwia a3, (a0), -16, 2 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: sw a2, 0(a1) +; CHECK-NEXT: ret + %addr = getelementptr i32, ptr %base, iXLen 0 %ld = load i32, ptr %addr - %addr.1 = getelementptr i32, ptr %base, i32 -16 + %addr.1 = getelementptr i32, ptr %base, iXLen -16 %res = add i32 %ld, %a store i32 %res, ptr %addr.2 ret ptr %addr.1 } define ptr @lwib(ptr %base, i32 %a) { -; RV32XTHEADMEMIDX-LABEL: lwib: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lwib a2, (a0), 4, 0 -; RV32XTHEADMEMIDX-NEXT: 
add a1, a2, a1 -; RV32XTHEADMEMIDX-NEXT: sw a1, 4(a0) -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lwib: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lwib a2, (a0), 4, 0 -; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1 -; RV64XTHEADMEMIDX-NEXT: sw a1, 4(a0) -; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i32, ptr %base, i32 1 +; CHECK-LABEL: lwib: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lwib a2, (a0), 4, 0 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: sw a1, 4(a0) +; CHECK-NEXT: ret + %addr = getelementptr i32, ptr %base, iXLen 1 %ld = load i32, ptr %addr - %addr.1 = getelementptr i32, ptr %base, i32 2 + %addr.1 = getelementptr i32, ptr %base, iXLen 2 %res = add i32 %ld, %a store i32 %res, ptr %addr.1 ret ptr %addr @@ -255,10 +173,10 @@ define ptr @lwuia(ptr %base, ptr %addr.2, i64 %a) { ; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2 ; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1) ; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i32, ptr %base, i32 0 + %addr = getelementptr i32, ptr %base, iXLen 0 %ld = load i32, ptr %addr %zext = zext i32 %ld to i64 - %addr.1 = getelementptr i32, ptr %base, i32 -16 + %addr.1 = getelementptr i32, ptr %base, iXLen -16 %res = add i64 %zext, %a store i64 %res, ptr %addr.2 ret ptr %addr.1 @@ -281,7 +199,7 @@ define ptr @lwuib(ptr %base, i64 %a, ptr %addr.1) { ; RV64XTHEADMEMIDX-NEXT: add a1, a3, a1 ; RV64XTHEADMEMIDX-NEXT: sd a1, 0(a2) ; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i32, ptr %base, i32 1 + %addr = getelementptr i32, ptr %base, iXLen 1 %ld = load i32, ptr %addr %zext = zext i32 %ld to i64 %res = add i64 %zext, %a @@ -309,9 +227,9 @@ define ptr @ldia(ptr %base, ptr %addr.2, i64 %a) { ; RV64XTHEADMEMIDX-NEXT: add a2, a3, a2 ; RV64XTHEADMEMIDX-NEXT: sd a2, 0(a1) ; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i64, ptr %base, i64 0 + %addr = getelementptr i64, ptr %base, iXLen 0 %ld = load i64, ptr %addr - %addr.1 = getelementptr i64, ptr %base, i64 -16 + %addr.1 = getelementptr i64, ptr %base, iXLen -16 %res = add i64 %ld, %a store i64 %res, ptr %addr.2 ret ptr %addr.1 @@ -336,117 +254,81 @@ define ptr @ldib(ptr %base, i64 %a) { ; RV64XTHEADMEMIDX-NEXT: add a1, a2, a1 ; RV64XTHEADMEMIDX-NEXT: sd a1, 8(a0) ; RV64XTHEADMEMIDX-NEXT: ret - %addr = getelementptr i64, ptr %base, i64 1 + %addr = getelementptr i64, ptr %base, iXLen 1 %ld = load i64, ptr %addr - %addr.1 = getelementptr i64, ptr %base, i64 2 + %addr.1 = getelementptr i64, ptr %base, iXLen 2 %res = add i64 %ld, %a store i64 %res, ptr %addr.1 ret ptr %addr } define ptr @sbia(ptr %base, i8 %a, i8 %b) { -; RV32XTHEADMEMIDX-LABEL: sbia: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV32XTHEADMEMIDX-NEXT: th.sbia a1, (a0), 1, 0 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: sbia: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV64XTHEADMEMIDX-NEXT: th.sbia a1, (a0), 1, 0 -; RV64XTHEADMEMIDX-NEXT: ret - %addr.1 = getelementptr i8, ptr %base, i8 1 +; CHECK-LABEL: sbia: +; CHECK: # %bb.0: +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: th.sbia a1, (a0), 1, 0 +; CHECK-NEXT: ret + %addr.1 = getelementptr i8, ptr %base, iXLen 1 %res = add i8 %a, %b store i8 %res, ptr %base ret ptr %addr.1 } define ptr @sbib(ptr %base, i8 %a, i8 %b) { -; RV32XTHEADMEMIDX-LABEL: sbib: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV32XTHEADMEMIDX-NEXT: th.sbib a1, (a0), 1, 0 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: sbib: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2 
-; RV64XTHEADMEMIDX-NEXT: th.sbib a1, (a0), 1, 0 -; RV64XTHEADMEMIDX-NEXT: ret - %addr.1 = getelementptr i8, ptr %base, i8 1 +; CHECK-LABEL: sbib: +; CHECK: # %bb.0: +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: th.sbib a1, (a0), 1, 0 +; CHECK-NEXT: ret + %addr.1 = getelementptr i8, ptr %base, iXLen 1 %res = add i8 %a, %b store i8 %res, ptr %addr.1 ret ptr %addr.1 } define ptr @shia(ptr %base, i16 %a, i16 %b) { -; RV32XTHEADMEMIDX-LABEL: shia: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV32XTHEADMEMIDX-NEXT: th.shia a1, (a0), -9, 1 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: shia: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV64XTHEADMEMIDX-NEXT: th.shia a1, (a0), -9, 1 -; RV64XTHEADMEMIDX-NEXT: ret - %addr.1 = getelementptr i16, ptr %base, i16 -9 +; CHECK-LABEL: shia: +; CHECK: # %bb.0: +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: th.shia a1, (a0), -9, 1 +; CHECK-NEXT: ret + %addr.1 = getelementptr i16, ptr %base, iXLen -9 %res = add i16 %a, %b store i16 %res, ptr %base ret ptr %addr.1 } define ptr @shib(ptr %base, i16 %a, i16 %b) { -; RV32XTHEADMEMIDX-LABEL: shib: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV32XTHEADMEMIDX-NEXT: th.shib a1, (a0), 2, 0 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: shib: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV64XTHEADMEMIDX-NEXT: th.shib a1, (a0), 2, 0 -; RV64XTHEADMEMIDX-NEXT: ret - %addr.1 = getelementptr i16, ptr %base, i16 1 +; CHECK-LABEL: shib: +; CHECK: # %bb.0: +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: th.shib a1, (a0), 2, 0 +; CHECK-NEXT: ret + %addr.1 = getelementptr i16, ptr %base, iXLen 1 %res = add i16 %a, %b store i16 %res, ptr %addr.1 ret ptr %addr.1 } define ptr @swia(ptr %base, i32 %a, i32 %b) { -; RV32XTHEADMEMIDX-LABEL: swia: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV32XTHEADMEMIDX-NEXT: th.swia a1, (a0), 8, 2 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: swia: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV64XTHEADMEMIDX-NEXT: th.swia a1, (a0), 8, 2 -; RV64XTHEADMEMIDX-NEXT: ret - %addr.1 = getelementptr i32, ptr %base, i32 8 +; CHECK-LABEL: swia: +; CHECK: # %bb.0: +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: th.swia a1, (a0), 8, 2 +; CHECK-NEXT: ret + %addr.1 = getelementptr i32, ptr %base, iXLen 8 %res = add i32 %a, %b store i32 %res, ptr %base ret ptr %addr.1 } define ptr @swib(ptr %base, i32 %a, i32 %b) { -; RV32XTHEADMEMIDX-LABEL: swib: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV32XTHEADMEMIDX-NEXT: th.swib a1, (a0), -13, 3 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: swib: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV64XTHEADMEMIDX-NEXT: th.swib a1, (a0), -13, 3 -; RV64XTHEADMEMIDX-NEXT: ret - %addr.1 = getelementptr i32, ptr %base, i32 -26 +; CHECK-LABEL: swib: +; CHECK: # %bb.0: +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: th.swib a1, (a0), -13, 3 +; CHECK-NEXT: ret + %addr.1 = getelementptr i32, ptr %base, iXLen -26 %res = add i32 %a, %b store i32 %res, ptr %addr.1 ret ptr %addr.1 @@ -470,7 +352,7 @@ define ptr @sdia(ptr %base, i64 %a, i64 %b) { ; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2 ; RV64XTHEADMEMIDX-NEXT: th.sdia a1, (a0), 8, 3 ; RV64XTHEADMEMIDX-NEXT: ret - %addr.1 = getelementptr i64, ptr %base, i64 8 + %addr.1 = getelementptr i64, ptr %base, iXLen 8 %res = add i64 %a, %b store i64 %res, ptr %base ret ptr 
%addr.1 @@ -492,48 +374,33 @@ define ptr @sdib(ptr %base, i64 %a, i64 %b) { ; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2 ; RV64XTHEADMEMIDX-NEXT: th.sdib a1, (a0), 8, 0 ; RV64XTHEADMEMIDX-NEXT: ret - %addr.1 = getelementptr i64, ptr %base, i64 1 + %addr.1 = getelementptr i64, ptr %base, iXLen 1 %res = add i64 %a, %b store i64 %res, ptr %addr.1 ret ptr %addr.1 } -define i8 @lrb_anyext(ptr %a, i64 %b) { -; RV32XTHEADMEMIDX-LABEL: lrb_anyext: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lrb_anyext: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0 -; RV64XTHEADMEMIDX-NEXT: ret - %1 = getelementptr i8, ptr %a, i64 %b +define i8 @lrb_anyext(ptr %a, iXLen %b) { +; CHECK-LABEL: lrb_anyext: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lrb a0, a0, a1, 0 +; CHECK-NEXT: ret + %1 = getelementptr i8, ptr %a, iXLen %b %2 = load i8, ptr %1, align 1 ret i8 %2 } -define i64 @lrb(ptr %a, i64 %b) { -; RV32XTHEADMEMIDX-LABEL: lrb: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrb a1, a0, a1, 0 -; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31 -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lrb: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0 -; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0 -; RV64XTHEADMEMIDX-NEXT: ret - %1 = getelementptr i8, ptr %a, i64 %b +define i32 @lrb(ptr %a, iXLen %b) { +; CHECK-LABEL: lrb: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lrb a0, a0, a1, 0 +; CHECK-NEXT: add a0, a0, a0 +; CHECK-NEXT: ret + %1 = getelementptr i8, ptr %a, iXLen %b %2 = load i8, ptr %1, align 1 - %3 = sext i8 %2 to i64 - %4 = add i64 %3, %3 - ret i64 %4 + %3 = sext i8 %2 to i32 + %4 = add i32 %3, %3 + ret i32 %4 } define i8 @lurb_anyext(ptr %a, i32 %b) { @@ -552,15 +419,11 @@ define i8 @lurb_anyext(ptr %a, i32 %b) { ret i8 %3 } -define i64 @lurb(ptr %a, i32 %b) { +define i32 @lurb(ptr %a, i32 %b) { ; RV32XTHEADMEMIDX-LABEL: lurb: ; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrb a1, a0, a1, 0 -; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31 -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1 +; RV32XTHEADMEMIDX-NEXT: th.lrb a0, a0, a1, 0 +; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lurb: @@ -571,37 +434,29 @@ define i64 @lurb(ptr %a, i32 %b) { %1 = zext i32 %b to i64 %2 = getelementptr i8, ptr %a, i64 %1 %3 = load i8, ptr %2, align 1 - %4 = sext i8 %3 to i64 - %5 = add i64 %4, %4 - ret i64 %5 -} - -define i64 @lrbu(ptr %a, i64 %b) { -; RV32XTHEADMEMIDX-LABEL: lrbu: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrbu a1, a0, a1, 0 -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lrbu: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lrbu a0, a0, a1, 0 -; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0 -; RV64XTHEADMEMIDX-NEXT: ret - %1 = getelementptr i8, ptr %a, i64 %b + %4 = sext i8 %3 to i32 + %5 = add i32 %4, %4 + ret i32 %5 +} + +define i32 @lrbu(ptr %a, iXLen %b) { +; CHECK-LABEL: lrbu: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lrbu a0, a0, a1, 0 +; CHECK-NEXT: add a0, a0, a0 +; CHECK-NEXT: ret + %1 = getelementptr i8, ptr %a, iXLen %b %2 = 
load i8, ptr %1, align 1 - %3 = zext i8 %2 to i64 - %4 = add i64 %3, %3 - ret i64 %4 + %3 = zext i8 %2 to i32 + %4 = add i32 %3, %3 + ret i32 %4 } -define i64 @lurbu(ptr %a, i32 %b) { +define i32 @lurbu(ptr %a, i32 %b) { ; RV32XTHEADMEMIDX-LABEL: lurbu: ; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrbu a1, a0, a1, 0 -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 +; RV32XTHEADMEMIDX-NEXT: th.lrbu a0, a0, a1, 0 +; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lurbu: @@ -612,47 +467,32 @@ define i64 @lurbu(ptr %a, i32 %b) { %1 = zext i32 %b to i64 %2 = getelementptr i8, ptr %a, i64 %1 %3 = load i8, ptr %2, align 1 - %4 = zext i8 %3 to i64 - %5 = add i64 %4, %4 - ret i64 %5 + %4 = zext i8 %3 to i32 + %5 = add i32 %4, %4 + ret i32 %5 } -define i16 @lrh_anyext(ptr %a, i64 %b) { -; RV32XTHEADMEMIDX-LABEL: lrh_anyext: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lrh_anyext: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1 -; RV64XTHEADMEMIDX-NEXT: ret - %1 = getelementptr i16, ptr %a, i64 %b +define i16 @lrh_anyext(ptr %a, iXLen %b) { +; CHECK-LABEL: lrh_anyext: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lrh a0, a0, a1, 1 +; CHECK-NEXT: ret + %1 = getelementptr i16, ptr %a, iXLen %b %2 = load i16, ptr %1, align 2 ret i16 %2 } -define i64 @lrh(ptr %a, i64 %b) { -; RV32XTHEADMEMIDX-LABEL: lrh: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrh a1, a0, a1, 1 -; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31 -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lrh: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1 -; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0 -; RV64XTHEADMEMIDX-NEXT: ret - %1 = getelementptr i16, ptr %a, i64 %b +define i32 @lrh(ptr %a, iXLen %b) { +; CHECK-LABEL: lrh: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lrh a0, a0, a1, 1 +; CHECK-NEXT: add a0, a0, a0 +; CHECK-NEXT: ret + %1 = getelementptr i16, ptr %a, iXLen %b %2 = load i16, ptr %1, align 2 - %3 = sext i16 %2 to i64 - %4 = add i64 %3, %3 - ret i64 %4 + %3 = sext i16 %2 to i32 + %4 = add i32 %3, %3 + ret i32 %4 } define i16 @lurh_anyext(ptr %a, i32 %b) { @@ -671,15 +511,11 @@ define i16 @lurh_anyext(ptr %a, i32 %b) { ret i16 %3 } -define i64 @lurh(ptr %a, i32 %b) { +define i32 @lurh(ptr %a, i32 %b) { ; RV32XTHEADMEMIDX-LABEL: lurh: ; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrh a1, a0, a1, 1 -; RV32XTHEADMEMIDX-NEXT: srai a2, a1, 31 -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1 +; RV32XTHEADMEMIDX-NEXT: th.lrh a0, a0, a1, 1 +; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lurh: @@ -690,37 +526,29 @@ define i64 @lurh(ptr %a, i32 %b) { %1 = zext i32 %b to i64 %2 = getelementptr i16, ptr %a, i64 %1 %3 = load i16, ptr %2, align 2 - %4 = sext i16 %3 to i64 - %5 = add i64 %4, %4 - ret i64 %5 -} - -define i64 @lrhu(ptr %a, i64 %b) { -; RV32XTHEADMEMIDX-LABEL: lrhu: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrhu a1, a0, a1, 1 -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: ret -; -; 
RV64XTHEADMEMIDX-LABEL: lrhu: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lrhu a0, a0, a1, 1 -; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0 -; RV64XTHEADMEMIDX-NEXT: ret - %1 = getelementptr i16, ptr %a, i64 %b + %4 = sext i16 %3 to i32 + %5 = add i32 %4, %4 + ret i32 %5 +} + +define i32 @lrhu(ptr %a, iXLen %b) { +; CHECK-LABEL: lrhu: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lrhu a0, a0, a1, 1 +; CHECK-NEXT: add a0, a0, a0 +; CHECK-NEXT: ret + %1 = getelementptr i16, ptr %a, iXLen %b %2 = load i16, ptr %1, align 2 - %3 = zext i16 %2 to i64 - %4 = add i64 %3, %3 - ret i64 %4 + %3 = zext i16 %2 to i32 + %4 = add i32 %3, %3 + ret i32 %4 } -define i64 @lurhu(ptr %a, i32 %b) { +define i32 @lurhu(ptr %a, i32 %b) { ; RV32XTHEADMEMIDX-LABEL: lurhu: ; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrhu a1, a0, a1, 1 -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 +; RV32XTHEADMEMIDX-NEXT: th.lrhu a0, a0, a1, 1 +; RV32XTHEADMEMIDX-NEXT: add a0, a0, a0 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lurhu: @@ -731,27 +559,22 @@ define i64 @lurhu(ptr %a, i32 %b) { %1 = zext i32 %b to i64 %2 = getelementptr i16, ptr %a, i64 %1 %3 = load i16, ptr %2, align 2 - %4 = zext i16 %3 to i64 - %5 = add i64 %4, %4 - ret i64 %5 + %4 = zext i16 %3 to i32 + %5 = add i32 %4, %4 + ret i32 %5 } -define i32 @lrw_anyext(ptr %a, i64 %b) { -; RV32XTHEADMEMIDX-LABEL: lrw_anyext: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: th.lrw a0, a0, a1, 2 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: lrw_anyext: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: th.lrw a0, a0, a1, 2 -; RV64XTHEADMEMIDX-NEXT: ret - %1 = getelementptr i32, ptr %a, i64 %b +define i32 @lrw_anyext(ptr %a, iXLen %b) { +; CHECK-LABEL: lrw_anyext: +; CHECK: # %bb.0: +; CHECK-NEXT: th.lrw a0, a0, a1, 2 +; CHECK-NEXT: ret + %1 = getelementptr i32, ptr %a, iXLen %b %2 = load i32, ptr %1, align 4 ret i32 %2 } -define i64 @lrw(ptr %a, i64 %b) { +define i64 @lrw(ptr %a, iXLen %b) { ; RV32XTHEADMEMIDX-LABEL: lrw: ; RV32XTHEADMEMIDX: # %bb.0: ; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 2 @@ -767,7 +590,7 @@ define i64 @lrw(ptr %a, i64 %b) { ; RV64XTHEADMEMIDX-NEXT: th.lrw a0, a0, a1, 2 ; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0 ; RV64XTHEADMEMIDX-NEXT: ret - %1 = getelementptr i32, ptr %a, i64 %b + %1 = getelementptr i32, ptr %a, iXLen %b %2 = load i32, ptr %1, align 4 %3 = sext i32 %2 to i64 %4 = add i64 %3, %3 @@ -814,7 +637,7 @@ define i64 @lurw(ptr %a, i32 %b) { ret i64 %5 } -define i64 @lrwu(ptr %a, i64 %b) { +define i64 @lrwu(ptr %a, iXLen %b) { ; RV32XTHEADMEMIDX-LABEL: lrwu: ; RV32XTHEADMEMIDX: # %bb.0: ; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 2 @@ -827,7 +650,7 @@ define i64 @lrwu(ptr %a, i64 %b) { ; RV64XTHEADMEMIDX-NEXT: th.lrwu a0, a0, a1, 2 ; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0 ; RV64XTHEADMEMIDX-NEXT: ret - %1 = getelementptr i32, ptr %a, i64 %b + %1 = getelementptr i32, ptr %a, iXLen %b %2 = load i32, ptr %1, align 4 %3 = zext i32 %2 to i64 %4 = add i64 %3, %3 @@ -855,7 +678,7 @@ define i64 @lurwu(ptr %a, i32 %b) { ret i64 %5 } -define i64 @lrd(ptr %a, i64 %b) { +define i64 @lrd(ptr %a, iXLen %b) { ; RV32XTHEADMEMIDX-LABEL: lrd: ; RV32XTHEADMEMIDX: # %bb.0: ; RV32XTHEADMEMIDX-NEXT: th.lrw a2, a0, a1, 3 @@ -872,13 +695,13 @@ define i64 @lrd(ptr %a, i64 %b) { ; RV64XTHEADMEMIDX-NEXT: th.lrd a0, a0, a1, 3 ; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0 ; RV64XTHEADMEMIDX-NEXT: ret - %1 = getelementptr i64, ptr %a, i64 %b + %1 = getelementptr i64, ptr %a, iXLen %b %2 = load 
i64, ptr %1, align 8 %3 = add i64 %2, %2 ret i64 %3 } -define i64 @lrd_2(ptr %a, i64 %b) { +define i64 @lrd_2(ptr %a, iXLen %b) { ; RV32XTHEADMEMIDX-LABEL: lrd_2: ; RV32XTHEADMEMIDX: # %bb.0: ; RV32XTHEADMEMIDX-NEXT: addi a2, a0, 96 @@ -897,8 +720,8 @@ define i64 @lrd_2(ptr %a, i64 %b) { ; RV64XTHEADMEMIDX-NEXT: th.lrd a0, a0, a1, 3 ; RV64XTHEADMEMIDX-NEXT: add a0, a0, a0 ; RV64XTHEADMEMIDX-NEXT: ret - %1 = add i64 %b, 12 - %2 = getelementptr i64, ptr %a, i64 %1 + %1 = add iXLen %b, 12 + %2 = getelementptr i64, ptr %a, iXLen %1 %3 = load i64, ptr %2, align 8 %4 = add i64 %3, %3 ret i64 %4 @@ -928,20 +751,14 @@ define i64 @lurd(ptr %a, i32 %b) { ret i64 %4 } -define void @srb(ptr %a, i64 %b, i8 %c) { -; RV32XTHEADMEMIDX-LABEL: srb: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3 -; RV32XTHEADMEMIDX-NEXT: th.srb a3, a0, a1, 0 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: srb: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV64XTHEADMEMIDX-NEXT: th.srb a2, a0, a1, 0 -; RV64XTHEADMEMIDX-NEXT: ret +define void @srb(ptr %a, iXLen %b, i8 %c) { +; CHECK-LABEL: srb: +; CHECK: # %bb.0: +; CHECK-NEXT: add a2, a2, a2 +; CHECK-NEXT: th.srb a2, a0, a1, 0 +; CHECK-NEXT: ret %1 = add i8 %c, %c - %2 = getelementptr i8, ptr %a, i64 %b + %2 = getelementptr i8, ptr %a, iXLen %b store i8 %1, ptr %2, align 1 ret void } @@ -965,20 +782,14 @@ define void @surb(ptr %a, i32 %b, i8 %c) { ret void } -define void @srh(ptr %a, i64 %b, i16 %c) { -; RV32XTHEADMEMIDX-LABEL: srh: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3 -; RV32XTHEADMEMIDX-NEXT: th.srh a3, a0, a1, 1 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: srh: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV64XTHEADMEMIDX-NEXT: th.srh a2, a0, a1, 1 -; RV64XTHEADMEMIDX-NEXT: ret +define void @srh(ptr %a, iXLen %b, i16 %c) { +; CHECK-LABEL: srh: +; CHECK: # %bb.0: +; CHECK-NEXT: add a2, a2, a2 +; CHECK-NEXT: th.srh a2, a0, a1, 1 +; CHECK-NEXT: ret %1 = add i16 %c, %c - %2 = getelementptr i16, ptr %a, i64 %b + %2 = getelementptr i16, ptr %a, iXLen %b store i16 %1, ptr %2, align 2 ret void } @@ -1002,20 +813,14 @@ define void @surh(ptr %a, i32 %b, i16 %c) { ret void } -define void @srw(ptr %a, i64 %b, i32 %c) { -; RV32XTHEADMEMIDX-LABEL: srw: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3 -; RV32XTHEADMEMIDX-NEXT: th.srw a3, a0, a1, 2 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: srw: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV64XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 2 -; RV64XTHEADMEMIDX-NEXT: ret +define void @srw(ptr %a, iXLen %b, i32 %c) { +; CHECK-LABEL: srw: +; CHECK: # %bb.0: +; CHECK-NEXT: add a2, a2, a2 +; CHECK-NEXT: th.srw a2, a0, a1, 2 +; CHECK-NEXT: ret %1 = add i32 %c, %c - %2 = getelementptr i32, ptr %a, i64 %b + %2 = getelementptr i32, ptr %a, iXLen %b store i32 %1, ptr %2, align 4 ret void } @@ -1039,16 +844,16 @@ define void @surw(ptr %a, i32 %b, i32 %c) { ret void } -define void @srd(ptr %a, i64 %b, i64 %c) { +define void @srd(ptr %a, iXLen %b, i64 %c) { ; RV32XTHEADMEMIDX-LABEL: srd: ; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a2, a3, a3 -; RV32XTHEADMEMIDX-NEXT: add a4, a4, a4 -; RV32XTHEADMEMIDX-NEXT: sltu a3, a2, a3 -; RV32XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a3, a4, a3 +; RV32XTHEADMEMIDX-NEXT: add a4, a2, a2 +; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3 +; RV32XTHEADMEMIDX-NEXT: sltu a2, a4, a2 +; 
RV32XTHEADMEMIDX-NEXT: th.srw a4, a0, a1, 3 +; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2 ; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4 -; RV32XTHEADMEMIDX-NEXT: th.srw a3, a0, a1, 3 +; RV32XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 3 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: srd: @@ -1057,7 +862,7 @@ define void @srd(ptr %a, i64 %b, i64 %c) { ; RV64XTHEADMEMIDX-NEXT: th.srd a2, a0, a1, 3 ; RV64XTHEADMEMIDX-NEXT: ret %1 = add i64 %c, %c - %2 = getelementptr i64, ptr %a, i64 %b + %2 = getelementptr i64, ptr %a, iXLen %b store i64 %1, ptr %2, align 8 ret void } @@ -1087,24 +892,18 @@ define void @surd(ptr %a, i32 %b, i64 %c) { } define ptr @test_simm5(ptr %base, i32 %a, i32 %b) { -; RV32XTHEADMEMIDX-LABEL: test_simm5: -; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV32XTHEADMEMIDX-NEXT: th.swia a1, (a0), -12, 2 -; RV32XTHEADMEMIDX-NEXT: ret -; -; RV64XTHEADMEMIDX-LABEL: test_simm5: -; RV64XTHEADMEMIDX: # %bb.0: -; RV64XTHEADMEMIDX-NEXT: add a1, a1, a2 -; RV64XTHEADMEMIDX-NEXT: th.swia a1, (a0), -12, 2 -; RV64XTHEADMEMIDX-NEXT: ret +; CHECK-LABEL: test_simm5: +; CHECK: # %bb.0: +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: th.swia a1, (a0), -12, 2 +; CHECK-NEXT: ret %addr.1 = getelementptr i32, ptr %base, i32 -12 %res = add i32 %a, %b store i32 %res, ptr %base ret ptr %addr.1 } -define i64 @lrd_large_shift(ptr %a, i64 %b) { +define i64 @lrd_large_shift(ptr %a, iXLen %b) { ; RV32XTHEADMEMIDX-LABEL: lrd_large_shift: ; RV32XTHEADMEMIDX: # %bb.0: ; RV32XTHEADMEMIDX-NEXT: slli a1, a1, 5 @@ -1119,14 +918,14 @@ define i64 @lrd_large_shift(ptr %a, i64 %b) { ; RV64XTHEADMEMIDX-NEXT: add a0, a1, a0 ; RV64XTHEADMEMIDX-NEXT: ld a0, 384(a0) ; RV64XTHEADMEMIDX-NEXT: ret - %1 = add i64 %b, 12 - %2 = shl i64 %1, 2 - %3 = getelementptr i64, ptr %a, i64 %2 + %1 = add iXLen %b, 12 + %2 = shl iXLen %1, 2 + %3 = getelementptr i64, ptr %a, iXLen %2 %4 = load i64, ptr %3, align 8 ret i64 %4 } -define i64 @lrd_large_offset(ptr %a, i64 %b) { +define i64 @lrd_large_offset(ptr %a, iXLen %b) { ; RV32XTHEADMEMIDX-LABEL: lrd_large_offset: ; RV32XTHEADMEMIDX: # %bb.0: ; RV32XTHEADMEMIDX-NEXT: slli a1, a1, 3 @@ -1145,8 +944,8 @@ define i64 @lrd_large_offset(ptr %a, i64 %b) { ; RV64XTHEADMEMIDX-NEXT: add a0, a0, a1 ; RV64XTHEADMEMIDX-NEXT: ld a0, 1792(a0) ; RV64XTHEADMEMIDX-NEXT: ret - %1 = add i64 %b, 12000 - %2 = getelementptr i64, ptr %a, i64 %1 + %1 = add iXLen %b, 12000 + %2 = getelementptr i64, ptr %a, iXLen %1 %3 = load i64, ptr %2, align 8 ret i64 %3 } diff --git a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll index f9db686..1ef37f7 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll @@ -242,7 +242,7 @@ define void @foo7(ptr nocapture %p) nounwind { ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: lui a1, %hi(d) ; RV64ZDINX-NEXT: addi a2, a1, %lo(d) -; RV64ZDINX-NEXT: lwu a2, 8(a2) +; RV64ZDINX-NEXT: lw a2, 8(a2) ; RV64ZDINX-NEXT: lwu a1, %lo(d+4)(a1) ; RV64ZDINX-NEXT: slli a2, a2, 32 ; RV64ZDINX-NEXT: or a1, a2, a1 @@ -337,7 +337,7 @@ define void @foo9(ptr nocapture %p) nounwind { ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: lui a1, %hi(e) ; RV64ZDINX-NEXT: addi a2, a1, %lo(e) -; RV64ZDINX-NEXT: lwu a2, 4(a2) +; RV64ZDINX-NEXT: lw a2, 4(a2) ; RV64ZDINX-NEXT: lwu a1, %lo(e)(a1) ; RV64ZDINX-NEXT: slli a2, a2, 32 ; RV64ZDINX-NEXT: or a1, a2, a1 @@ -480,7 +480,7 @@ define double @foo13(ptr nocapture %p) nounwind { ; RV64ZDINX-LABEL: foo13: ; RV64ZDINX: # %bb.0: # %entry ; 
RV64ZDINX-NEXT: lui a0, %hi(f) -; RV64ZDINX-NEXT: lwu a1, %lo(f+8)(a0) +; RV64ZDINX-NEXT: lw a1, %lo(f+8)(a0) ; RV64ZDINX-NEXT: lwu a0, %lo(f+4)(a0) ; RV64ZDINX-NEXT: slli a1, a1, 32 ; RV64ZDINX-NEXT: or a0, a1, a0 diff --git a/llvm/test/CodeGen/SPARC/tls-sp.ll b/llvm/test/CodeGen/SPARC/tls-sp.ll new file mode 100644 index 0000000..de9af01 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/tls-sp.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=sparc -relocation-model=pic < %s | FileCheck --check-prefix=SPARC %s +; RUN: llc -mtriple=sparc64 -relocation-model=pic < %s | FileCheck --check-prefix=SPARC64 %s + +@x = external thread_local global i8 + +;; Test that we don't over-allocate stack space when calling __tls_get_addr +;; if the call frame pseudos can be eliminated. +define ptr @no_alloca() nounwind { +; SPARC-LABEL: no_alloca: +; SPARC: ! %bb.0: ! %entry +; SPARC-NEXT: save %sp, -96, %sp +; SPARC-NEXT: .Ltmp0: +; SPARC-NEXT: call .Ltmp1 +; SPARC-NEXT: .Ltmp2: +; SPARC-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0 +; SPARC-NEXT: .Ltmp1: +; SPARC-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0 +; SPARC-NEXT: add %i0, %o7, %i0 +; SPARC-NEXT: sethi %tgd_hi22(x), %i1 +; SPARC-NEXT: add %i1, %tgd_lo10(x), %i1 +; SPARC-NEXT: add %i0, %i1, %o0, %tgd_add(x) +; SPARC-NEXT: call __tls_get_addr, %tgd_call(x) +; SPARC-NEXT: nop +; SPARC-NEXT: ret +; SPARC-NEXT: restore %g0, %o0, %o0 +; +; SPARC64-LABEL: no_alloca: +; SPARC64: ! %bb.0: ! %entry +; SPARC64-NEXT: save %sp, -128, %sp +; SPARC64-NEXT: .Ltmp0: +; SPARC64-NEXT: rd %pc, %o7 +; SPARC64-NEXT: .Ltmp2: +; SPARC64-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0 +; SPARC64-NEXT: .Ltmp1: +; SPARC64-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0 +; SPARC64-NEXT: add %i0, %o7, %i0 +; SPARC64-NEXT: sethi %tgd_hi22(x), %i1 +; SPARC64-NEXT: add %i1, %tgd_lo10(x), %i1 +; SPARC64-NEXT: add %i0, %i1, %o0, %tgd_add(x) +; SPARC64-NEXT: call __tls_get_addr, %tgd_call(x) +; SPARC64-NEXT: nop +; SPARC64-NEXT: ret +; SPARC64-NEXT: restore %g0, %o0, %o0 +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @x) + ret ptr %0 +} + +;; Test that %sp is valid for the call to __tls_get_addr. We store to a dynamic +;; alloca in order to prevent eliminating any call frame pseudos from the call. +define ptr @dynamic_alloca(i64 %n) nounwind { +; SPARC-LABEL: dynamic_alloca: +; SPARC: ! %bb.0: ! %entry +; SPARC-NEXT: save %sp, -96, %sp +; SPARC-NEXT: .Ltmp3: +; SPARC-NEXT: call .Ltmp4 +; SPARC-NEXT: .Ltmp5: +; SPARC-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.Ltmp3)), %i0 +; SPARC-NEXT: .Ltmp4: +; SPARC-NEXT: or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.Ltmp3)), %i0 +; SPARC-NEXT: add %i0, %o7, %i0 +; SPARC-NEXT: sethi %tgd_hi22(x), %i2 +; SPARC-NEXT: add %i2, %tgd_lo10(x), %i2 +; SPARC-NEXT: add %i0, %i2, %o0, %tgd_add(x) +; SPARC-NEXT: call __tls_get_addr, %tgd_call(x) +; SPARC-NEXT: nop +; SPARC-NEXT: add %i1, 7, %i0 +; SPARC-NEXT: and %i0, -8, %i0 +; SPARC-NEXT: sub %sp, %i0, %i0 +; SPARC-NEXT: add %i0, -8, %sp +; SPARC-NEXT: mov 1, %i1 +; SPARC-NEXT: stb %i1, [%i0+88] +; SPARC-NEXT: ret +; SPARC-NEXT: restore %g0, %o0, %o0 +; +; SPARC64-LABEL: dynamic_alloca: +; SPARC64: ! %bb.0: ! 
%entry +; SPARC64-NEXT: save %sp, -128, %sp +; SPARC64-NEXT: .Ltmp3: +; SPARC64-NEXT: rd %pc, %o7 +; SPARC64-NEXT: .Ltmp5: +; SPARC64-NEXT: sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.Ltmp3)), %i1 +; SPARC64-NEXT: .Ltmp4: +; SPARC64-NEXT: or %i1, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.Ltmp3)), %i1 +; SPARC64-NEXT: add %i1, %o7, %i1 +; SPARC64-NEXT: sethi %tgd_hi22(x), %i2 +; SPARC64-NEXT: add %i2, %tgd_lo10(x), %i2 +; SPARC64-NEXT: add %i1, %i2, %o0, %tgd_add(x) +; SPARC64-NEXT: call __tls_get_addr, %tgd_call(x) +; SPARC64-NEXT: nop +; SPARC64-NEXT: add %i0, 15, %i0 +; SPARC64-NEXT: and %i0, -16, %i0 +; SPARC64-NEXT: sub %sp, %i0, %i0 +; SPARC64-NEXT: mov %i0, %sp +; SPARC64-NEXT: mov 1, %i1 +; SPARC64-NEXT: stb %i1, [%i0+2175] +; SPARC64-NEXT: ret +; SPARC64-NEXT: restore %g0, %o0, %o0 +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @x) + %1 = alloca i8, i64 %n + store i8 1, ptr %1 + ret ptr %0 +} diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll index 3d46b52..70030ca 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/fp-intrinsics.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std" @@ -337,3 +338,68 @@ entry: } declare float @llvm.fma.f32(float, float, float) + +; CHECK: OpFunction +; CHECK: %[[#d:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#fracPtr:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#integralPtr:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#varPtr:]] = OpVariable %[[#]] Function +; CHECK: %[[#frac:]] = OpExtInst %[[#var2]] %[[#extinst_id]] modf %[[#d]] %[[#varPtr]] +; CHECK: %[[#integral:]] = OpLoad %[[#var2]] %[[#varPtr]] +; CHECK: OpStore %[[#fracPtr]] %[[#frac]] +; CHECK: OpStore %[[#integralPtr]] %[[#integral]] +; CHECK: OpFunctionEnd +define void @TestModf(double %d, ptr addrspace(1) %frac, ptr addrspace(1) %integral) { +entry: + %4 = tail call { double, double } @llvm.modf.f64(double %d) + %5 = extractvalue { double, double } %4, 0 + %6 = extractvalue { double, double } %4, 1 + store double %5, ptr addrspace(1) %frac, align 8 + store double %6, ptr addrspace(1) %integral, align 8 + ret void +} + +; CHECK: OpFunction +; CHECK: %[[#d:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#fracPtr:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#integralPtr:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#entryBlock:]] = OpLabel +; CHECK: %[[#varPtr:]] = OpVariable %[[#]] Function +; CHECK: OpBranchConditional %[[#]] %[[#lor_lhs_falseBlock:]] %[[#if_thenBlock:]] +; CHECK: %[[#lor_lhs_falseBlock]] = OpLabel +; CHECK: OpBranchConditional %[[#]] %[[#if_endBlock:]] %[[#if_thenBlock]] +; CHECK: %[[#if_thenBlock]] = OpLabel +; CHECK: OpBranch %[[#returnBlock:]] +; CHECK: %[[#if_endBlock]] = OpLabel +; CHECK: %[[#frac:]] = OpExtInst %[[#var2]] %[[#extinst_id]] modf %[[#d]] %[[#varPtr]] +; CHECK: %[[#integral:]] = OpLoad %[[#var2]] %[[#varPtr]] +; CHECK: OpStore %[[#fracPtr]] %[[#frac]] +; CHECK: OpStore %[[#integralPtr]] %[[#integral]] +; CHECK: OpFunctionEnd +define dso_local void @TestModf2(double noundef %d, ptr noundef %frac, ptr noundef %integral) { +entry: + %0 = load ptr, ptr %frac, align 8 + %tobool = icmp ne ptr %0, null + br i1 %tobool, label %lor.lhs.false, label %if.then + +lor.lhs.false: + %1 = load ptr, ptr %integral, 
align 8 + %tobool1 = icmp ne ptr %1, null + br i1 %tobool1, label %if.end, label %if.then + +if.then: + br label %return + +if.end: + %6 = tail call { double, double } @llvm.modf.f64(double %d) + %7 = extractvalue { double, double } %6, 0 + %8 = extractvalue { double, double } %6, 1 + store double %7, ptr %frac, align 4 + store double %8, ptr %integral, align 4 + br label %return + +return: + ret void +} + +declare { double, double } @llvm.modf.f64(double) diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll index bbf4d50..8a6a303 100644 --- a/llvm/test/CodeGen/SystemZ/pr60413.ll +++ b/llvm/test/CodeGen/SystemZ/pr60413.ll @@ -16,31 +16,31 @@ define dso_local void @m() local_unnamed_addr #1 { ; CHECK-NEXT: stmg %r13, %r15, 104(%r15) ; CHECK-NEXT: aghi %r15, -168 ; CHECK-NEXT: lhrl %r1, f+4 +; CHECK-NEXT: sll %r1, 8 ; CHECK-NEXT: larl %r2, f -; CHECK-NEXT: llc %r2, 6(%r2) -; CHECK-NEXT: larl %r3, e -; CHECK-NEXT: lb %r0, 3(%r3) -; CHECK-NEXT: rosbg %r2, %r1, 32, 55, 8 -; CHECK-NEXT: vlvgp %v0, %r2, %r0 -; CHECK-NEXT: vlvgf %v0, %r2, 0 -; CHECK-NEXT: vlvgf %v0, %r2, 2 -; CHECK-NEXT: vlvgp %v1, %r0, %r2 -; CHECK-NEXT: vlvgp %v2, %r2, %r2 -; CHECK-NEXT: lr %r1, %r2 +; CHECK-NEXT: ic %r1, 6(%r2) +; CHECK-NEXT: larl %r2, e +; CHECK-NEXT: lb %r0, 3(%r2) +; CHECK-NEXT: vlvgp %v0, %r0, %r1 +; CHECK-NEXT: vlvgp %v1, %r1, %r0 +; CHECK-NEXT: vlvgf %v1, %r1, 0 +; CHECK-NEXT: vlvgf %v1, %r1, 2 +; CHECK-NEXT: vlvgp %v2, %r1, %r1 +; CHECK-NEXT: # kill: def $r1l killed $r1l killed $r1d ; CHECK-NEXT: nilh %r1, 255 ; CHECK-NEXT: chi %r1, 128 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: risbg %r1, %r1, 63, 191, 36 +; CHECK-NEXT: vlvgf %v0, %r0, 0 +; CHECK-NEXT: vlvgf %v0, %r0, 2 ; CHECK-NEXT: vgbm %v3, 30583 ; CHECK-NEXT: vn %v0, %v0, %v3 -; CHECK-NEXT: vlvgf %v1, %r0, 0 -; CHECK-NEXT: vlvgf %v1, %r0, 2 ; CHECK-NEXT: vn %v1, %v1, %v3 ; CHECK-NEXT: vrepf %v2, %v2, 1 ; CHECK-NEXT: vn %v2, %v2, %v3 ; CHECK-NEXT: vrepif %v3, 127 -; CHECK-NEXT: vchlf %v0, %v0, %v3 -; CHECK-NEXT: vlgvf %r13, %v0, 0 +; CHECK-NEXT: vchlf %v1, %v1, %v3 +; CHECK-NEXT: vlgvf %r13, %v1, 0 ; CHECK-NEXT: vchlf %v2, %v2, %v3 ; CHECK-NEXT: vlgvf %r3, %v2, 1 ; CHECK-NEXT: nilf %r3, 1 @@ -54,13 +54,13 @@ define dso_local void @m() local_unnamed_addr #1 { ; CHECK-NEXT: nilf %r14, 1 ; CHECK-NEXT: rosbg %r2, %r14, 32, 51, 12 ; CHECK-NEXT: rosbg %r2, %r13, 52, 52, 11 -; CHECK-NEXT: vlgvf %r13, %v0, 1 +; CHECK-NEXT: vlgvf %r13, %v1, 1 ; CHECK-NEXT: rosbg %r2, %r13, 53, 53, 10 -; CHECK-NEXT: vlgvf %r13, %v0, 2 +; CHECK-NEXT: vlgvf %r13, %v1, 2 ; CHECK-NEXT: rosbg %r2, %r13, 54, 54, 9 -; CHECK-NEXT: vlgvf %r13, %v0, 3 +; CHECK-NEXT: vlgvf %r13, %v1, 3 ; CHECK-NEXT: rosbg %r2, %r13, 55, 55, 8 -; CHECK-NEXT: vchlf %v0, %v1, %v3 +; CHECK-NEXT: vchlf %v0, %v0, %v3 ; CHECK-NEXT: vlgvf %r13, %v0, 0 ; CHECK-NEXT: rosbg %r2, %r13, 56, 56, 7 ; CHECK-NEXT: vlgvf %r13, %v0, 1 diff --git a/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll b/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll index 9acdd7e..b70505c 100644 --- a/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll +++ b/llvm/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll @@ -17,6 +17,7 @@ declare void @_ZNSsC1EPKcRKSaIcE() unnamed_addr #0 ; CHECK: .LBB0_2 ; Function Attrs: nounwind define hidden void @_ZN4llvm14DOTGraphTraitsIPNS_13ScheduleDAGMIEE17getEdgeAttributesEPKNS_5SUnitENS_13SUnitIteratorEPKNS_11ScheduleDAGE() #0 align 2 { + %a = alloca i8 br i1 undef, label %1, label %2 ; <label>:1: ; preds = %0 @@ -25,7 +26,7 @@ define hidden void 
@_ZN4llvm14DOTGraphTraitsIPNS_13ScheduleDAGMIEE17getEdgeAttri br label %3 ; <label>:2: ; preds = %0 - call void @llvm.lifetime.start.p0(i64 1, ptr undef) #0 + call void @llvm.lifetime.start.p0(i64 1, ptr %a) #0 call void @_ZNSaIcEC2Ev() #0 br label %3 diff --git a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll new file mode 100644 index 0000000..8030438 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll @@ -0,0 +1,151 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +declare i32 @memcmp(ptr, ptr, i32) + +define i1 @memcmp_expand_3(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_3: +; CHECK: .functype memcmp_expand_3 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.load16_u $push7=, 0($0):p2align=0 +; CHECK-NEXT: i32.load16_u $push6=, 0($1):p2align=0 +; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6 +; CHECK-NEXT: i32.const $push0=, 2 +; CHECK-NEXT: i32.add $push3=, $0, $pop0 +; CHECK-NEXT: i32.load8_u $push4=, 0($pop3) +; CHECK-NEXT: i32.const $push13=, 2 +; CHECK-NEXT: i32.add $push1=, $1, $pop13 +; CHECK-NEXT: i32.load8_u $push2=, 0($pop1) +; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2 +; CHECK-NEXT: i32.or $push9=, $pop8, $pop5 +; CHECK-NEXT: i32.const $push10=, 65535 +; CHECK-NEXT: i32.and $push11=, $pop9, $pop10 +; CHECK-NEXT: i32.eqz $push12=, $pop11 +; CHECK-NEXT: return $pop12 + %cmp_3 = call i32 @memcmp(ptr %a, ptr %b, i32 3) + %res = icmp eq i32 %cmp_3, 0 + ret i1 %res +} + +define i1 @memcmp_expand_5(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_5: +; CHECK: .functype memcmp_expand_5 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.load $push7=, 0($0):p2align=0 +; CHECK-NEXT: i32.load $push6=, 0($1):p2align=0 +; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6 +; CHECK-NEXT: i32.const $push0=, 4 +; CHECK-NEXT: i32.add $push3=, $0, $pop0 +; CHECK-NEXT: i32.load8_u $push4=, 0($pop3) +; CHECK-NEXT: i32.const $push11=, 4 +; CHECK-NEXT: i32.add $push1=, $1, $pop11 +; CHECK-NEXT: i32.load8_u $push2=, 0($pop1) +; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2 +; CHECK-NEXT: i32.or $push9=, $pop8, $pop5 +; CHECK-NEXT: i32.eqz $push10=, $pop9 +; CHECK-NEXT: return $pop10 + %cmp_5 = call i32 @memcmp(ptr %a, ptr %b, i32 5) + %res = icmp eq i32 %cmp_5, 0 + ret i1 %res +} + +define i1 @memcmp_expand_7(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_7: +; CHECK: .functype memcmp_expand_7 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.load $push7=, 0($0):p2align=0 +; CHECK-NEXT: i32.load $push6=, 0($1):p2align=0 +; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6 +; CHECK-NEXT: i32.const $push0=, 3 +; CHECK-NEXT: i32.add $push3=, $0, $pop0 +; CHECK-NEXT: i32.load $push4=, 0($pop3):p2align=0 +; CHECK-NEXT: i32.const $push11=, 3 +; CHECK-NEXT: i32.add $push1=, $1, $pop11 +; CHECK-NEXT: i32.load $push2=, 0($pop1):p2align=0 +; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2 +; CHECK-NEXT: i32.or $push9=, $pop8, $pop5 +; CHECK-NEXT: i32.eqz $push10=, $pop9 +; CHECK-NEXT: return $pop10 + %cmp_7 = call i32 @memcmp(ptr %a, ptr %b, i32 7) + %res = icmp eq i32 %cmp_7, 0 + ret i1 %res +} + +; INFO: Negative test +; Should not expand even with simd128 +define i1 @memcmp_expand_129(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_129: +; CHECK: .functype memcmp_expand_129 (i32, i32) -> (i32) +; CHECK-NEXT: # 
%bb.0: +; CHECK-NEXT: i32.const $push0=, 129 +; CHECK-NEXT: call $push1=, memcmp, $0, $1, $pop0 +; CHECK-NEXT: i32.eqz $push2=, $pop1 +; CHECK-NEXT: return $pop2 + %cmp_129 = call i32 @memcmp(ptr %a, ptr %b, i32 129) + %res = icmp eq i32 %cmp_129, 0 + ret i1 %res +} + +define i1 @memcmp_expand_2(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_2: +; CHECK: .functype memcmp_expand_2 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.load16_u $push1=, 0($0):p2align=0 +; CHECK-NEXT: i32.load16_u $push0=, 0($1):p2align=0 +; CHECK-NEXT: i32.eq $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %cmp_2 = call i32 @memcmp(ptr %a, ptr %b, i32 2) + %res = icmp eq i32 %cmp_2, 0 + ret i1 %res +} + +define i1 @memcmp_expand_2_align(ptr align(2) %a, ptr align(2) %b) { +; CHECK-LABEL: memcmp_expand_2_align: +; CHECK: .functype memcmp_expand_2_align (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i32.load16_u $push1=, 0($0) +; CHECK-NEXT: i32.load16_u $push0=, 0($1) +; CHECK-NEXT: i32.eq $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %cmp_2 = call i32 @memcmp(ptr %a, ptr %b, i32 2) + %res = icmp eq i32 %cmp_2, 0 + ret i1 %res +} + +define i1 @memcmp_expand_8(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_8: +; CHECK: .functype memcmp_expand_8 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.load $push1=, 0($0):p2align=0 +; CHECK-NEXT: i64.load $push0=, 0($1):p2align=0 +; CHECK-NEXT: i64.eq $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %cmp_8 = call i32 @memcmp(ptr %a, ptr %b, i32 8) + %res = icmp eq i32 %cmp_8, 0 + ret i1 %res +} + +; TODO: Should use a single i64x2 load, or loads of equivalent total bit width +define i1 @memcmp_expand_16(ptr %a, ptr %b) { +; CHECK-LABEL: memcmp_expand_16: +; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.load $push7=, 0($0):p2align=0 +; CHECK-NEXT: i64.load $push6=, 0($1):p2align=0 +; CHECK-NEXT: i64.xor $push8=, $pop7, $pop6 +; CHECK-NEXT: i32.const $push0=, 8 +; CHECK-NEXT: i32.add $push3=, $0, $pop0 +; CHECK-NEXT: i64.load $push4=, 0($pop3):p2align=0 +; CHECK-NEXT: i32.const $push11=, 8 +; CHECK-NEXT: i32.add $push1=, $1, $pop11 +; CHECK-NEXT: i64.load $push2=, 0($pop1):p2align=0 +; CHECK-NEXT: i64.xor $push5=, $pop4, $pop2 +; CHECK-NEXT: i64.or $push9=, $pop8, $pop5 +; CHECK-NEXT: i64.eqz $push10=, $pop9 +; CHECK-NEXT: return $pop10 + %cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16) + %res = icmp eq i32 %cmp_16, 0 + ret i1 %res +} diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll new file mode 100644 index 0000000..97c2311 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll @@ -0,0 +1,1413 @@ +; RUN: opt -S -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s + +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" + +%struct.TwoInts = type { i32, i32 } +%struct.ThreeInts = type { i32, i32, i32 } +%struct.FourInts = type { i32, i32, i32, i32 } +%struct.ThreeShorts = type { i16, i16, i16 } +%struct.FourShorts = type { i16, i16, i16, i16 } +%struct.FiveShorts = type { i16, i16, i16, i16, i16 } +%struct.TwoBytes = type { i8, i8 } +%struct.ThreeBytes = type { i8, i8, i8 } +%struct.FourBytes = type { i8, i8, i8, i8 } +%struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 } + +; CHECK-LABEL: two_ints_same_op: +; CHECK: loop +; CHECK: i32.load +; CHECK: i32.load +; 
CHECK: i32.add +; CHECK: i32.store +; CHECK: i32.load +; CHECK: i32.load +; CHECK: i32.add +; CHECK: i32.store +define hidden void @two_ints_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %21, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.TwoInts, ptr %1, i32 %8 + %10 = load i32, ptr %9, align 4 + %11 = getelementptr inbounds %struct.TwoInts, ptr %2, i32 %8 + %12 = load i32, ptr %11, align 4 + %13 = add i32 %12, %10 + %14 = getelementptr inbounds %struct.TwoInts, ptr %0, i32 %8 + store i32 %13, ptr %14, align 4 + %15 = getelementptr inbounds i8, ptr %9, i32 4 + %16 = load i32, ptr %15, align 4 + %17 = getelementptr inbounds i8, ptr %11, i32 4 + %18 = load i32, ptr %17, align 4 + %19 = add i32 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 4 + store i32 %19, ptr %20, align 4 + %21 = add nuw i32 %8, 1 + %22 = icmp eq i32 %21, %3 + br i1 %22, label %6, label %7 +} + +; CHECK-LABEL: two_ints_vary_op: +; CHECK: loop +; CHECK: i32.load +; CHECK: i32.load +; CHECK: i32.add +; CHECK: i32.store +; CHECK: i32.load +; CHECK: i32.load +; CHECK: i32.sub +; CHECK: i32.store +define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %21, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.TwoInts, ptr %1, i32 %8 + %10 = load i32, ptr %9, align 4 + %11 = getelementptr inbounds %struct.TwoInts, ptr %2, i32 %8 + %12 = load i32, ptr %11, align 4 + %13 = add i32 %12, %10 + %14 = getelementptr inbounds %struct.TwoInts, ptr %0, i32 %8 + store i32 %13, ptr %14, align 4 + %15 = getelementptr inbounds i8, ptr %9, i32 4 + %16 = load i32, ptr %15, align 4 + %17 = getelementptr inbounds i8, ptr %11, i32 4 + %18 = load i32, ptr %17, align 4 + %19 = sub i32 %16, %18 + %20 = getelementptr inbounds i8, ptr %14, i32 4 + store i32 %19, ptr %20, align 4 + %21 = add nuw i32 %8, 1 + %22 = icmp eq i32 %21, %3 + br i1 %22, label %6, label %7 +} + +; CHECK-LABEL: three_ints: +; CHECK: loop +; CHECK: i32.load +; CHECK: i32.load +; CHECK: i32.add +; CHECK: i32.store +; CHECK: i32.load +; CHECK: i32.load +; CHECK: i32.add +; CHECK: i32.store +; CHECK: i32.load +; CHECK: i32.load +; CHECK: i32.add +; CHECK: i32.store +define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %27, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.ThreeInts, ptr %1, i32 %8 + %10 = load i32, ptr %9, align 4 + %11 = getelementptr inbounds %struct.ThreeInts, ptr %2, i32 %8 + %12 = load i32, ptr %11, align 4 + %13 = add nsw i32 %12, %10 + %14 = getelementptr inbounds %struct.ThreeInts, ptr %0, i32 %8 + store i32 %13, ptr %14, align 4 + %15 = getelementptr inbounds i8, ptr %9, i32 4 + %16 = load i32, ptr %15, align 4 + %17 = getelementptr inbounds i8, ptr %11, i32 4 + %18 = load i32, ptr %17, align 4 + %19 = add nsw i32 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 4 + store i32 %19, ptr %20, align 4 + %21 = getelementptr inbounds i8, 
ptr %9, i32 8 + %22 = load i32, ptr %21, align 4 + %23 = getelementptr inbounds i8, ptr %11, i32 8 + %24 = load i32, ptr %23, align 4 + %25 = add nsw i32 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 8 + store i32 %25, ptr %26, align 4 + %27 = add nuw i32 %8, 1 + %28 = icmp eq i32 %27, %3 + br i1 %28, label %6, label %7 +} + +; CHECK-LABEL: three_shorts: +; CHECK: loop +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.mul +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.mul +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.mul +; CHECK: i32.store16 +define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %27, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.ThreeShorts, ptr %1, i32 %8 + %10 = load i16, ptr %9, align 2 + %11 = getelementptr inbounds %struct.ThreeShorts, ptr %2, i32 %8 + %12 = load i16, ptr %11, align 2 + %13 = mul i16 %12, %10 + %14 = getelementptr inbounds %struct.ThreeShorts, ptr %0, i32 %8 + store i16 %13, ptr %14, align 2 + %15 = getelementptr inbounds i8, ptr %9, i32 2 + %16 = load i16, ptr %15, align 2 + %17 = getelementptr inbounds i8, ptr %11, i32 2 + %18 = load i16, ptr %17, align 2 + %19 = mul i16 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 2 + store i16 %19, ptr %20, align 2 + %21 = getelementptr inbounds i8, ptr %9, i32 4 + %22 = load i16, ptr %21, align 2 + %23 = getelementptr inbounds i8, ptr %11, i32 4 + %24 = load i16, ptr %23, align 2 + %25 = mul i16 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 4 + store i16 %25, ptr %26, align 2 + %27 = add nuw i32 %8, 1 + %28 = icmp eq i32 %27, %3 + br i1 %28, label %6, label %7 +} + +; CHECK-LABEL: four_shorts_same_op: +; CHECK: loop +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.sub +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.sub +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.sub +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.sub +; CHECK: i32.store16 +define hidden void @four_shorts_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %33, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.FourShorts, ptr %1, i32 %8 + %10 = load i16, ptr %9, align 2 + %11 = getelementptr inbounds %struct.FourShorts, ptr %2, i32 %8 + %12 = load i16, ptr %11, align 2 + %13 = sub i16 %10, %12 + %14 = getelementptr inbounds %struct.FourShorts, ptr %0, i32 %8 + store i16 %13, ptr %14, align 2 + %15 = getelementptr inbounds i8, ptr %9, i32 2 + %16 = load i16, ptr %15, align 2 + %17 = getelementptr inbounds i8, ptr %11, i32 2 + %18 = load i16, ptr %17, align 2 + %19 = sub i16 %16, %18 + %20 = getelementptr inbounds i8, ptr %14, i32 2 + store i16 %19, ptr %20, align 2 + %21 = getelementptr inbounds i8, ptr %9, i32 4 + %22 = load i16, ptr %21, align 2 + %23 = getelementptr inbounds i8, ptr %11, i32 4 + %24 = load i16, ptr %23, align 2 + %25 = sub i16 %22, %24 + %26 = getelementptr inbounds i8, ptr %14, i32 4 + store i16 %25, ptr %26, align 2 + %27 = 
getelementptr inbounds i8, ptr %9, i32 6 + %28 = load i16, ptr %27, align 2 + %29 = getelementptr inbounds i8, ptr %11, i32 6 + %30 = load i16, ptr %29, align 2 + %31 = sub i16 %28, %30 + %32 = getelementptr inbounds i8, ptr %14, i32 6 + store i16 %31, ptr %32, align 2 + %33 = add nuw i32 %8, 1 + %34 = icmp eq i32 %33, %3 + br i1 %34, label %6, label %7 +} + +; CHECK-LABEL: four_shorts_split_op: +; CHECK: loop +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.or +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.or +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.xor +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.xor +; CHECK: i32.store16 +define hidden void @four_shorts_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %33, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.FourShorts, ptr %1, i32 %8 + %10 = load i16, ptr %9, align 2 + %11 = getelementptr inbounds %struct.FourShorts, ptr %2, i32 %8 + %12 = load i16, ptr %11, align 2 + %13 = or i16 %12, %10 + %14 = getelementptr inbounds %struct.FourShorts, ptr %0, i32 %8 + store i16 %13, ptr %14, align 2 + %15 = getelementptr inbounds i8, ptr %9, i32 2 + %16 = load i16, ptr %15, align 2 + %17 = getelementptr inbounds i8, ptr %11, i32 2 + %18 = load i16, ptr %17, align 2 + %19 = or i16 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 2 + store i16 %19, ptr %20, align 2 + %21 = getelementptr inbounds i8, ptr %9, i32 4 + %22 = load i16, ptr %21, align 2 + %23 = getelementptr inbounds i8, ptr %11, i32 4 + %24 = load i16, ptr %23, align 2 + %25 = xor i16 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 4 + store i16 %25, ptr %26, align 2 + %27 = getelementptr inbounds i8, ptr %9, i32 6 + %28 = load i16, ptr %27, align 2 + %29 = getelementptr inbounds i8, ptr %11, i32 6 + %30 = load i16, ptr %29, align 2 + %31 = xor i16 %30, %28 + %32 = getelementptr inbounds i8, ptr %14, i32 6 + store i16 %31, ptr %32, align 2 + %33 = add nuw i32 %8, 1 + %34 = icmp eq i32 %33, %3 + br i1 %34, label %6, label %7 +} + +; CHECK-LABEL: four_shorts_interleave_op: +; CHECK: loop +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.or +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.xor +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.or +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.xor +; CHECK: i32.store16 +define hidden void @four_shorts_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %33, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.FourShorts, ptr %1, i32 %8 + %10 = load i16, ptr %9, align 2 + %11 = getelementptr inbounds %struct.FourShorts, ptr %2, i32 %8 + %12 = load i16, ptr %11, align 2 + %13 = or i16 %12, %10 + %14 = getelementptr inbounds %struct.FourShorts, ptr %0, i32 %8 + store i16 %13, ptr %14, align 2 + %15 = getelementptr inbounds i8, ptr %9, i32 2 + %16 = load i16, ptr %15, align 2 + %17 = getelementptr inbounds i8, ptr %11, i32 2 + %18 = load i16, ptr %17, align 2 + 
%19 = xor i16 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 2 + store i16 %19, ptr %20, align 2 + %21 = getelementptr inbounds i8, ptr %9, i32 4 + %22 = load i16, ptr %21, align 2 + %23 = getelementptr inbounds i8, ptr %11, i32 4 + %24 = load i16, ptr %23, align 2 + %25 = or i16 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 4 + store i16 %25, ptr %26, align 2 + %27 = getelementptr inbounds i8, ptr %9, i32 6 + %28 = load i16, ptr %27, align 2 + %29 = getelementptr inbounds i8, ptr %11, i32 6 + %30 = load i16, ptr %29, align 2 + %31 = xor i16 %30, %28 + %32 = getelementptr inbounds i8, ptr %14, i32 6 + store i16 %31, ptr %32, align 2 + %33 = add nuw i32 %8, 1 + %34 = icmp eq i32 %33, %3 + br i1 %34, label %6, label %7 +} + +; CHECK-LABEL: five_shorts: +; CHECK: loop +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.sub +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.sub +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.sub +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.sub +; CHECK: i32.store16 +; CHECK: i32.load16_u +; CHECK: i32.load16_u +; CHECK: i32.sub +; CHECK: i32.store16 +define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %39, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.FiveShorts, ptr %1, i32 %8 + %10 = load i16, ptr %9, align 1 + %11 = getelementptr inbounds %struct.FiveShorts, ptr %2, i32 %8 + %12 = load i16, ptr %11, align 1 + %13 = sub i16 %10, %12 + %14 = getelementptr inbounds %struct.FiveShorts, ptr %0, i32 %8 + store i16 %13, ptr %14, align 1 + %15 = getelementptr inbounds i16, ptr %9, i32 1 + %16 = load i16, ptr %15, align 1 + %17 = getelementptr inbounds i16, ptr %11, i32 1 + %18 = load i16, ptr %17, align 1 + %19 = sub i16 %16, %18 + %20 = getelementptr inbounds i16, ptr %14, i32 1 + store i16 %19, ptr %20, align 1 + %21 = getelementptr inbounds i16, ptr %9, i32 2 + %22 = load i16, ptr %21, align 1 + %23 = getelementptr inbounds i16, ptr %11, i32 2 + %24 = load i16, ptr %23, align 1 + %25 = sub i16 %22, %24 + %26 = getelementptr inbounds i16, ptr %14, i32 2 + store i16 %25, ptr %26, align 1 + %27 = getelementptr inbounds i16, ptr %9, i32 3 + %28 = load i16, ptr %27, align 1 + %29 = getelementptr inbounds i16, ptr %11, i32 3 + %30 = load i16, ptr %29, align 1 + %31 = sub i16 %28, %30 + %32 = getelementptr inbounds i16, ptr %14, i32 3 + store i16 %31, ptr %32, align 1 + %33 = getelementptr inbounds i16, ptr %9, i32 4 + %34 = load i16, ptr %33, align 1 + %35 = getelementptr inbounds i16, ptr %11, i32 4 + %36 = load i16, ptr %35, align 1 + %37 = sub i16 %34, %36 + %38 = getelementptr inbounds i16, ptr %14, i32 4 + store i16 %37, ptr %38, align 1 + %39 = add nuw i32 %8, 1 + %40 = icmp eq i32 %39, %3 + br i1 %40, label %6, label %7 +} + +; CHECK-LABEL: two_bytes_same_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret 
void + +7: ; preds = %4, %7 + %8 = phi i32 [ %21, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.TwoBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = getelementptr inbounds %struct.TwoBytes, ptr %2, i32 %8 + %12 = load i8, ptr %11, align 1 + %13 = mul i8 %12, %10 + %14 = getelementptr inbounds %struct.TwoBytes, ptr %0, i32 %8 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %9, i32 1 + %16 = load i8, ptr %15, align 1 + %17 = getelementptr inbounds i8, ptr %11, i32 1 + %18 = load i8, ptr %17, align 1 + %19 = mul i8 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 1 + store i8 %19, ptr %20, align 1 + %21 = add nuw i32 %8, 1 + %22 = icmp eq i32 %21, %3 + br i1 %22, label %6, label %7 +} + +; CHECK-LABEL: two_bytes_vary_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %21, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.TwoBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = getelementptr inbounds %struct.TwoBytes, ptr %2, i32 %8 + %12 = load i8, ptr %11, align 1 + %13 = mul i8 %12, %10 + %14 = getelementptr inbounds %struct.TwoBytes, ptr %0, i32 %8 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %9, i32 1 + %16 = load i8, ptr %15, align 1 + %17 = getelementptr inbounds i8, ptr %11, i32 1 + %18 = load i8, ptr %17, align 1 + %19 = sub i8 %16, %18 + %20 = getelementptr inbounds i8, ptr %14, i32 1 + store i8 %19, ptr %20, align 1 + %21 = add nuw i32 %8, 1 + %22 = icmp eq i32 %21, %3 + br i1 %22, label %6, label %7 +} + +; CHECK-LABEL: three_bytes_same_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.and +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.and +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.and +; CHECK: i32.store8 +define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %27, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.ThreeBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = getelementptr inbounds %struct.ThreeBytes, ptr %2, i32 %8 + %12 = load i8, ptr %11, align 1 + %13 = and i8 %12, %10 + %14 = getelementptr inbounds %struct.ThreeBytes, ptr %0, i32 %8 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %9, i32 1 + %16 = load i8, ptr %15, align 1 + %17 = getelementptr inbounds i8, ptr %11, i32 1 + %18 = load i8, ptr %17, align 1 + %19 = and i8 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 1 + store i8 %19, ptr %20, align 1 + %21 = getelementptr inbounds i8, ptr %9, i32 2 + %22 = load i8, ptr %21, align 1 + %23 = getelementptr inbounds i8, ptr %11, i32 2 + %24 = load i8, ptr %23, align 1 + %25 = and i8 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 2 + store i8 %25, ptr %26, align 1 + %27 = add nuw i32 %8, 1 + %28 = icmp eq i32 %27, %3 + br i1 %28, label %6, label %7 +} + +; 
CHECK-LABEL: three_bytes_interleave_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %27, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.ThreeBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = getelementptr inbounds %struct.ThreeBytes, ptr %2, i32 %8 + %12 = load i8, ptr %11, align 1 + %13 = add i8 %12, %10 + %14 = getelementptr inbounds %struct.ThreeBytes, ptr %0, i32 %8 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %9, i32 1 + %16 = load i8, ptr %15, align 1 + %17 = getelementptr inbounds i8, ptr %11, i32 1 + %18 = load i8, ptr %17, align 1 + %19 = sub i8 %16, %18 + %20 = getelementptr inbounds i8, ptr %14, i32 1 + store i8 %19, ptr %20, align 1 + %21 = getelementptr inbounds i8, ptr %9, i32 2 + %22 = load i8, ptr %21, align 1 + %23 = getelementptr inbounds i8, ptr %11, i32 2 + %24 = load i8, ptr %23, align 1 + %25 = add i8 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 2 + store i8 %25, ptr %26, align 1 + %27 = add nuw i32 %8, 1 + %28 = icmp eq i32 %27, %3 + br i1 %28, label %6, label %7 +} + +; CHECK-LABEL: four_bytes_same_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.and +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.and +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.and +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.and +; CHECK: i32.store8 +define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %33, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8 + %12 = load i8, ptr %11, align 1 + %13 = and i8 %12, %10 + %14 = getelementptr inbounds %struct.FourBytes, ptr %0, i32 %8 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %9, i32 1 + %16 = load i8, ptr %15, align 1 + %17 = getelementptr inbounds i8, ptr %11, i32 1 + %18 = load i8, ptr %17, align 1 + %19 = and i8 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 1 + store i8 %19, ptr %20, align 1 + %21 = getelementptr inbounds i8, ptr %9, i32 2 + %22 = load i8, ptr %21, align 1 + %23 = getelementptr inbounds i8, ptr %11, i32 2 + %24 = load i8, ptr %23, align 1 + %25 = and i8 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 2 + store i8 %25, ptr %26, align 1 + %27 = getelementptr inbounds i8, ptr %9, i32 3 + %28 = load i8, ptr %27, align 1 + %29 = getelementptr inbounds i8, ptr %11, i32 3 + %30 = load i8, ptr %29, align 1 + %31 = and i8 %30, %28 + %32 = getelementptr inbounds i8, ptr %14, i32 3 + store i8 %31, ptr %32, align 1 + %33 = add nuw i32 %8, 1 + %34 = icmp eq i32 %33, %3 + br i1 %34, label %6, label %7 +} + +; CHECK-LABEL: 
four_bytes_split_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %33, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8 + %12 = load i8, ptr %11, align 1 + %13 = mul i8 %12, %10 + %14 = getelementptr inbounds %struct.FourBytes, ptr %0, i32 %8 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %9, i32 1 + %16 = load i8, ptr %15, align 1 + %17 = getelementptr inbounds i8, ptr %11, i32 1 + %18 = load i8, ptr %17, align 1 + %19 = mul i8 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 1 + store i8 %19, ptr %20, align 1 + %21 = getelementptr inbounds i8, ptr %9, i32 2 + %22 = load i8, ptr %21, align 1 + %23 = getelementptr inbounds i8, ptr %11, i32 2 + %24 = load i8, ptr %23, align 1 + %25 = sub i8 %22, %24 + %26 = getelementptr inbounds i8, ptr %14, i32 2 + store i8 %25, ptr %26, align 1 + %27 = getelementptr inbounds i8, ptr %9, i32 3 + %28 = load i8, ptr %27, align 1 + %29 = getelementptr inbounds i8, ptr %11, i32 3 + %30 = load i8, ptr %29, align 1 + %31 = sub i8 %28, %30 + %32 = getelementptr inbounds i8, ptr %14, i32 3 + store i8 %31, ptr %32, align 1 + %33 = add nuw i32 %8, 1 + %34 = icmp eq i32 %33, %3 + br i1 %34, label %6, label %7 +} + +; CHECK-LABEL: four_bytes_interleave_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %33, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8 + %12 = load i8, ptr %11, align 1 + %13 = add i8 %12, %10 + %14 = getelementptr inbounds %struct.FourBytes, ptr %0, i32 %8 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %9, i32 1 + %16 = load i8, ptr %15, align 1 + %17 = getelementptr inbounds i8, ptr %11, i32 1 + %18 = load i8, ptr %17, align 1 + %19 = sub i8 %16, %18 + %20 = getelementptr inbounds i8, ptr %14, i32 1 + store i8 %19, ptr %20, align 1 + %21 = getelementptr inbounds i8, ptr %9, i32 2 + %22 = load i8, ptr %21, align 1 + %23 = getelementptr inbounds i8, ptr %11, i32 2 + %24 = load i8, ptr %23, align 1 + %25 = add i8 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 2 + store i8 %25, ptr %26, align 1 + %27 = getelementptr inbounds i8, ptr %9, i32 
3 + %28 = load i8, ptr %27, align 1 + %29 = getelementptr inbounds i8, ptr %11, i32 3 + %30 = load i8, ptr %29, align 1 + %31 = sub i8 %28, %30 + %32 = getelementptr inbounds i8, ptr %14, i32 3 + store i8 %31, ptr %32, align 1 + %33 = add nuw i32 %8, 1 + %34 = icmp eq i32 %33, %3 + br i1 %34, label %6, label %7 +} + +; CHECK-LABEL: eight_bytes_same_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store8 +define hidden void @eight_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %57, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.EightBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = getelementptr inbounds %struct.EightBytes, ptr %2, i32 %8 + %12 = load i8, ptr %11, align 1 + %13 = mul i8 %12, %10 + %14 = getelementptr inbounds %struct.EightBytes, ptr %0, i32 %8 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %9, i32 1 + %16 = load i8, ptr %15, align 1 + %17 = getelementptr inbounds i8, ptr %11, i32 1 + %18 = load i8, ptr %17, align 1 + %19 = mul i8 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 1 + store i8 %19, ptr %20, align 1 + %21 = getelementptr inbounds i8, ptr %9, i32 2 + %22 = load i8, ptr %21, align 1 + %23 = getelementptr inbounds i8, ptr %11, i32 2 + %24 = load i8, ptr %23, align 1 + %25 = mul i8 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 2 + store i8 %25, ptr %26, align 1 + %27 = getelementptr inbounds i8, ptr %9, i32 3 + %28 = load i8, ptr %27, align 1 + %29 = getelementptr inbounds i8, ptr %11, i32 3 + %30 = load i8, ptr %29, align 1 + %31 = mul i8 %30, %28 + %32 = getelementptr inbounds i8, ptr %14, i32 3 + store i8 %31, ptr %32, align 1 + %33 = getelementptr inbounds i8, ptr %9, i32 4 + %34 = load i8, ptr %33, align 1 + %35 = getelementptr inbounds i8, ptr %11, i32 4 + %36 = load i8, ptr %35, align 1 + %37 = mul i8 %36, %34 + %38 = getelementptr inbounds i8, ptr %14, i32 4 + store i8 %37, ptr %38, align 1 + %39 = getelementptr inbounds i8, ptr %9, i32 5 + %40 = load i8, ptr %39, align 1 + %41 = getelementptr inbounds i8, ptr %11, i32 5 + %42 = load i8, ptr %41, align 1 + %43 = mul i8 %42, %40 + %44 = getelementptr inbounds i8, ptr %14, i32 5 + store i8 %43, ptr %44, align 1 + %45 = getelementptr inbounds i8, ptr %9, i32 6 + %46 = load i8, ptr %45, align 1 + %47 = getelementptr inbounds i8, ptr %11, i32 6 + %48 = load i8, ptr %47, align 1 + %49 = mul i8 %48, %46 + %50 = getelementptr inbounds i8, ptr %14, i32 6 + store i8 %49, ptr %50, align 1 + %51 = getelementptr inbounds i8, ptr %9, i32 7 + %52 = load i8, ptr %51, align 1 + %53 = getelementptr inbounds i8, ptr %11, i32 7 + %54 = load i8, ptr %53, align 1 + %55 = mul i8 %54, %52 + %56 = getelementptr inbounds i8, ptr 
%14, i32 7 + store i8 %55, ptr %56, align 1 + %57 = add nuw i32 %8, 1 + %58 = icmp eq i32 %57, %3 + br i1 %58, label %6, label %7 +} + +; CHECK-LABEL: eight_bytes_split_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +define hidden void @eight_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %57, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.EightBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = getelementptr inbounds %struct.EightBytes, ptr %2, i32 %8 + %12 = load i8, ptr %11, align 1 + %13 = add i8 %12, %10 + %14 = getelementptr inbounds %struct.EightBytes, ptr %0, i32 %8 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %9, i32 1 + %16 = load i8, ptr %15, align 1 + %17 = getelementptr inbounds i8, ptr %11, i32 1 + %18 = load i8, ptr %17, align 1 + %19 = add i8 %18, %16 + %20 = getelementptr inbounds i8, ptr %14, i32 1 + store i8 %19, ptr %20, align 1 + %21 = getelementptr inbounds i8, ptr %9, i32 2 + %22 = load i8, ptr %21, align 1 + %23 = getelementptr inbounds i8, ptr %11, i32 2 + %24 = load i8, ptr %23, align 1 + %25 = add i8 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 2 + store i8 %25, ptr %26, align 1 + %27 = getelementptr inbounds i8, ptr %9, i32 3 + %28 = load i8, ptr %27, align 1 + %29 = getelementptr inbounds i8, ptr %11, i32 3 + %30 = load i8, ptr %29, align 1 + %31 = add i8 %30, %28 + %32 = getelementptr inbounds i8, ptr %14, i32 3 + store i8 %31, ptr %32, align 1 + %33 = getelementptr inbounds i8, ptr %9, i32 4 + %34 = load i8, ptr %33, align 1 + %35 = getelementptr inbounds i8, ptr %11, i32 4 + %36 = load i8, ptr %35, align 1 + %37 = sub i8 %34, %36 + %38 = getelementptr inbounds i8, ptr %14, i32 4 + store i8 %37, ptr %38, align 1 + %39 = getelementptr inbounds i8, ptr %9, i32 5 + %40 = load i8, ptr %39, align 1 + %41 = getelementptr inbounds i8, ptr %11, i32 5 + %42 = load i8, ptr %41, align 1 + %43 = sub i8 %40, %42 + %44 = getelementptr inbounds i8, ptr %14, i32 5 + store i8 %43, ptr %44, align 1 + %45 = getelementptr inbounds i8, ptr %9, i32 6 + %46 = load i8, ptr %45, align 1 + %47 = getelementptr inbounds i8, ptr %11, i32 6 + %48 = load i8, ptr %47, align 1 + %49 = sub i8 %46, %48 + %50 = getelementptr inbounds i8, ptr %14, i32 6 + store i8 %49, ptr %50, align 1 + %51 = getelementptr inbounds i8, ptr %9, i32 7 + %52 = load i8, ptr %51, align 1 + %53 = getelementptr inbounds i8, ptr %11, i32 7 + %54 = load i8, ptr %53, align 1 + %55 = sub i8 %52, %54 + %56 = getelementptr inbounds i8, ptr %14, i32 7 + store i8 %55, ptr %56, align 1 + %57 = add nuw i32 %8, 1 + %58 = icmp eq i32 %57, %3 + br i1 %58, label %6, label %7 +} + +; CHECK-LABEL: eight_bytes_interleave_op: +; 
CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store8 +define hidden void @eight_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %57, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.EightBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = getelementptr inbounds %struct.EightBytes, ptr %2, i32 %8 + %12 = load i8, ptr %11, align 1 + %13 = add i8 %12, %10 + %14 = getelementptr inbounds %struct.EightBytes, ptr %0, i32 %8 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %9, i32 1 + %16 = load i8, ptr %15, align 1 + %17 = getelementptr inbounds i8, ptr %11, i32 1 + %18 = load i8, ptr %17, align 1 + %19 = sub i8 %16, %18 + %20 = getelementptr inbounds i8, ptr %14, i32 1 + store i8 %19, ptr %20, align 1 + %21 = getelementptr inbounds i8, ptr %9, i32 2 + %22 = load i8, ptr %21, align 1 + %23 = getelementptr inbounds i8, ptr %11, i32 2 + %24 = load i8, ptr %23, align 1 + %25 = add i8 %24, %22 + %26 = getelementptr inbounds i8, ptr %14, i32 2 + store i8 %25, ptr %26, align 1 + %27 = getelementptr inbounds i8, ptr %9, i32 3 + %28 = load i8, ptr %27, align 1 + %29 = getelementptr inbounds i8, ptr %11, i32 3 + %30 = load i8, ptr %29, align 1 + %31 = sub i8 %28, %30 + %32 = getelementptr inbounds i8, ptr %14, i32 3 + store i8 %31, ptr %32, align 1 + %33 = getelementptr inbounds i8, ptr %9, i32 4 + %34 = load i8, ptr %33, align 1 + %35 = getelementptr inbounds i8, ptr %11, i32 4 + %36 = load i8, ptr %35, align 1 + %37 = add i8 %36, %34 + %38 = getelementptr inbounds i8, ptr %14, i32 4 + store i8 %37, ptr %38, align 1 + %39 = getelementptr inbounds i8, ptr %9, i32 5 + %40 = load i8, ptr %39, align 1 + %41 = getelementptr inbounds i8, ptr %11, i32 5 + %42 = load i8, ptr %41, align 1 + %43 = sub i8 %40, %42 + %44 = getelementptr inbounds i8, ptr %14, i32 5 + store i8 %43, ptr %44, align 1 + %45 = getelementptr inbounds i8, ptr %9, i32 6 + %46 = load i8, ptr %45, align 1 + %47 = getelementptr inbounds i8, ptr %11, i32 6 + %48 = load i8, ptr %47, align 1 + %49 = add i8 %48, %46 + %50 = getelementptr inbounds i8, ptr %14, i32 6 + store i8 %49, ptr %50, align 1 + %51 = getelementptr inbounds i8, ptr %9, i32 7 + %52 = load i8, ptr %51, align 1 + %53 = getelementptr inbounds i8, ptr %11, i32 7 + %54 = load i8, ptr %53, align 1 + %55 = sub i8 %52, %54 + %56 = getelementptr inbounds i8, ptr %14, i32 7 + store i8 %55, ptr %56, align 1 + %57 = add nuw i32 %8, 1 + %58 = icmp eq i32 %57, %3 + br i1 %58, label %6, label %7 +} + +; CHECK-LABEL: four_bytes_into_four_ints_same_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.load +; CHECK: i32.add +; CHECK: i32.store +; CHECK: i32.load8_u +; CHECK: 
i32.load8_u +; CHECK: i32.mul +; CHECK: i32.load +; CHECK: i32.add +; CHECK: i32.store +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.load +; CHECK: i32.add +; CHECK: i32.store +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.load +; CHECK: i32.add +; CHECK: i32.store +define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noundef %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %49, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = zext i8 %10 to i32 + %12 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8 + %13 = load i8, ptr %12, align 1 + %14 = zext i8 %13 to i32 + %15 = mul nuw nsw i32 %14, %11 + %16 = getelementptr inbounds %struct.FourInts, ptr %0, i32 %8 + %17 = load i32, ptr %16, align 4 + %18 = add nsw i32 %15, %17 + store i32 %18, ptr %16, align 4 + %19 = getelementptr inbounds i8, ptr %9, i32 1 + %20 = load i8, ptr %19, align 1 + %21 = zext i8 %20 to i32 + %22 = getelementptr inbounds i8, ptr %12, i32 1 + %23 = load i8, ptr %22, align 1 + %24 = zext i8 %23 to i32 + %25 = mul nuw nsw i32 %24, %21 + %26 = getelementptr inbounds i8, ptr %16, i32 4 + %27 = load i32, ptr %26, align 4 + %28 = add nsw i32 %25, %27 + store i32 %28, ptr %26, align 4 + %29 = getelementptr inbounds i8, ptr %9, i32 2 + %30 = load i8, ptr %29, align 1 + %31 = zext i8 %30 to i32 + %32 = getelementptr inbounds i8, ptr %12, i32 2 + %33 = load i8, ptr %32, align 1 + %34 = zext i8 %33 to i32 + %35 = mul nuw nsw i32 %34, %31 + %36 = getelementptr inbounds i8, ptr %16, i32 8 + %37 = load i32, ptr %36, align 4 + %38 = add nsw i32 %35, %37 + store i32 %38, ptr %36, align 4 + %39 = getelementptr inbounds i8, ptr %9, i32 3 + %40 = load i8, ptr %39, align 1 + %41 = zext i8 %40 to i32 + %42 = getelementptr inbounds i8, ptr %12, i32 3 + %43 = load i8, ptr %42, align 1 + %44 = zext i8 %43 to i32 + %45 = mul nuw nsw i32 %44, %41 + %46 = getelementptr inbounds i8, ptr %16, i32 12 + %47 = load i32, ptr %46, align 4 + %48 = add nsw i32 %45, %47 + store i32 %48, ptr %46, align 4 + %49 = add nuw i32 %8, 1 + %50 = icmp eq i32 %49, %3 + br i1 %50, label %6, label %7 +} + +; CHECK-LABEL: four_bytes_into_four_ints_vary_op: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.add +; CHECK: i32.store +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.sub +; CHECK: i32.store +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.mul +; CHECK: i32.store +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.and +; CHECK: i32.store +define hidden void @four_bytes_into_four_ints_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) { + %5 = icmp eq i32 %3, 0 + br i1 %5, label %6, label %7 + +6: ; preds = %7, %4 + ret void + +7: ; preds = %4, %7 + %8 = phi i32 [ %40, %7 ], [ 0, %4 ] + %9 = getelementptr inbounds %struct.FourBytes, ptr %1, i32 %8 + %10 = load i8, ptr %9, align 1 + %11 = zext i8 %10 to i32 + %12 = getelementptr inbounds %struct.FourBytes, ptr %2, i32 %8 + %13 = load i8, ptr %12, align 1 + %14 = zext i8 %13 to i32 + %15 = add nuw nsw i32 %14, %11 + %16 = getelementptr inbounds %struct.FourInts, ptr %0, i32 %8 + store i32 %15, ptr %16, align 4 + %17 = getelementptr inbounds i8, ptr %9, 
i32 1 + %18 = load i8, ptr %17, align 1 + %19 = zext i8 %18 to i32 + %20 = getelementptr inbounds i8, ptr %12, i32 1 + %21 = load i8, ptr %20, align 1 + %22 = zext i8 %21 to i32 + %23 = sub nsw i32 %19, %22 + %24 = getelementptr inbounds i8, ptr %16, i32 4 + store i32 %23, ptr %24, align 4 + %25 = getelementptr inbounds i8, ptr %9, i32 2 + %26 = load i8, ptr %25, align 1 + %27 = zext i8 %26 to i32 + %28 = getelementptr inbounds i8, ptr %12, i32 2 + %29 = load i8, ptr %28, align 1 + %30 = zext i8 %29 to i32 + %31 = mul nuw nsw i32 %30, %27 + %32 = getelementptr inbounds i8, ptr %16, i32 8 + store i32 %31, ptr %32, align 4 + %33 = getelementptr inbounds i8, ptr %9, i32 3 + %34 = load i8, ptr %33, align 1 + %35 = getelementptr inbounds i8, ptr %12, i32 3 + %36 = load i8, ptr %35, align 1 + %37 = and i8 %36, %34 + %38 = zext i8 %37 to i32 + %39 = getelementptr inbounds i8, ptr %16, i32 12 + store i32 %38, ptr %39, align 4 + %40 = add nuw i32 %8, 1 + %41 = icmp eq i32 %40, %3 + br i1 %41, label %6, label %7 +} + +; CHECK-LABEL: scale_uv_row_down2: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.store8 +define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) { + %5 = icmp sgt i32 %3, 0 + br i1 %5, label %6, label %19 + +6: ; preds = %4, %6 + %7 = phi i32 [ %17, %6 ], [ 0, %4 ] + %8 = phi ptr [ %15, %6 ], [ %0, %4 ] + %9 = phi ptr [ %16, %6 ], [ %2, %4 ] + %10 = getelementptr inbounds i8, ptr %8, i32 2 + %11 = load i8, ptr %10, align 1 + store i8 %11, ptr %9, align 1 + %12 = getelementptr inbounds i8, ptr %8, i32 3 + %13 = load i8, ptr %12, align 1 + %14 = getelementptr inbounds i8, ptr %9, i32 1 + store i8 %13, ptr %14, align 1 + %15 = getelementptr inbounds i8, ptr %8, i32 4 + %16 = getelementptr inbounds i8, ptr %9, i32 2 + %17 = add nuw nsw i32 %7, 1 + %18 = icmp eq i32 %17, %3 + br i1 %18, label %19, label %6 + +19: ; preds = %6, %4 + ret void +} + +; CHECK-LABEL: scale_uv_row_down2_box: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.shr_u +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.shr_u +; CHECK: i32.store8 +define hidden void @scale_uv_row_down2_box(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) { + %5 = icmp sgt i32 %3, 0 + br i1 %5, label %6, label %54 + +6: ; preds = %4 + %7 = add nsw i32 %1, 2 + %8 = add nsw i32 %1, 1 + %9 = add nsw i32 %1, 3 + br label %10 + +10: ; preds = %6, %10 + %11 = phi i32 [ 0, %6 ], [ %52, %10 ] + %12 = phi ptr [ %0, %6 ], [ %50, %10 ] + %13 = phi ptr [ %2, %6 ], [ %51, %10 ] + %14 = load i8, ptr %12, align 1 + %15 = zext i8 %14 to i16 + %16 = getelementptr inbounds i8, ptr %12, i32 2 + %17 = load i8, ptr %16, align 1 + %18 = zext i8 %17 to i16 + %19 = getelementptr inbounds i8, ptr %12, i32 %1 + %20 = load i8, ptr %19, align 1 + %21 = zext i8 %20 to i16 + %22 = getelementptr inbounds i8, ptr %12, i32 %7 + %23 = load i8, ptr %22, align 1 + %24 = zext i8 %23 to i16 + %25 = add nuw nsw i16 %15, 2 + %26 = add nuw nsw i16 %25, %18 + %27 = add nuw nsw i16 %26, %21 + %28 = add nuw nsw i16 %27, %24 + %29 = lshr i16 %28, 2 + %30 = trunc nuw i16 %29 to i8 + store i8 %30, ptr %13, align 1 + %31 = getelementptr inbounds i8, ptr %12, i32 1 + %32 = load i8, ptr %31, align 1 + %33 = zext i8 %32 to i16 + %34 = getelementptr inbounds i8, ptr 
%12, i32 3 + %35 = load i8, ptr %34, align 1 + %36 = zext i8 %35 to i16 + %37 = getelementptr inbounds i8, ptr %12, i32 %8 + %38 = load i8, ptr %37, align 1 + %39 = zext i8 %38 to i16 + %40 = getelementptr inbounds i8, ptr %12, i32 %9 + %41 = load i8, ptr %40, align 1 + %42 = zext i8 %41 to i16 + %43 = add nuw nsw i16 %33, 2 + %44 = add nuw nsw i16 %43, %36 + %45 = add nuw nsw i16 %44, %39 + %46 = add nuw nsw i16 %45, %42 + %47 = lshr i16 %46, 2 + %48 = trunc nuw i16 %47 to i8 + %49 = getelementptr inbounds i8, ptr %13, i32 1 + store i8 %48, ptr %49, align 1 + %50 = getelementptr inbounds i8, ptr %12, i32 4 + %51 = getelementptr inbounds i8, ptr %13, i32 2 + %52 = add nuw nsw i32 %11, 1 + %53 = icmp eq i32 %52, %3 + br i1 %53, label %54, label %10 + +54: ; preds = %10, %4 + ret void +} + +; CHECK-LABEL: scale_uv_row_down2_linear: +; CHECK: loop +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.shr_u +; CHECK: i32.store8 +; CHECK: i32.load8_u +; CHECK: i32.load8_u +; CHECK: i32.shr_u +; CHECK: i32.store8 +define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0, i32 noundef %1, ptr nocapture noundef writeonly %2, i32 noundef %3) { + %5 = icmp sgt i32 %3, 0 + br i1 %5, label %6, label %34 + +6: ; preds = %4, %6 + %7 = phi i32 [ %32, %6 ], [ 0, %4 ] + %8 = phi ptr [ %30, %6 ], [ %0, %4 ] + %9 = phi ptr [ %31, %6 ], [ %2, %4 ] + %10 = load i8, ptr %8, align 1 + %11 = zext i8 %10 to i16 + %12 = getelementptr inbounds i8, ptr %8, i32 2 + %13 = load i8, ptr %12, align 1 + %14 = zext i8 %13 to i16 + %15 = add nuw nsw i16 %11, 1 + %16 = add nuw nsw i16 %15, %14 + %17 = lshr i16 %16, 1 + %18 = trunc nuw i16 %17 to i8 + store i8 %18, ptr %9, align 1 + %19 = getelementptr inbounds i8, ptr %8, i32 1 + %20 = load i8, ptr %19, align 1 + %21 = zext i8 %20 to i16 + %22 = getelementptr inbounds i8, ptr %8, i32 3 + %23 = load i8, ptr %22, align 1 + %24 = zext i8 %23 to i16 + %25 = add nuw nsw i16 %21, 1 + %26 = add nuw nsw i16 %25, %24 + %27 = lshr i16 %26, 1 + %28 = trunc nuw i16 %27 to i8 + %29 = getelementptr inbounds i8, ptr %9, i32 1 + store i8 %28, ptr %29, align 1 + %30 = getelementptr inbounds i8, ptr %8, i32 4 + %31 = getelementptr inbounds i8, ptr %9, i32 2 + %32 = add nuw nsw i32 %7, 1 + %33 = icmp eq i32 %32, %3 + br i1 %33, label %34, label %6 + +34: ; preds = %6, %4 + ret void +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll index 8459ec8..b355a0d 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll @@ -441,3 +441,31 @@ define <2 x double> @promote_mixed_v2f64(<4 x float> %x, <4 x float> %y) { %a = fpext <2 x float> %v to <2 x double> ret <2 x double> %a } + +define <4 x float> @convert_u_v4f32_maybeneg(<4 x i32> %x) { +; CHECK-LABEL: convert_u_v4f32_maybeneg: +; CHECK: .functype convert_u_v4f32_maybeneg (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 1 +; CHECK-NEXT: i32x4.shr_s +; CHECK-NEXT: f32x4.convert_i32x4_u +; CHECK-NEXT: # fallthrough-return + %a = ashr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1> + %b = uitofp <4 x i32> %a to <4 x float> + ret <4 x float> %b +} + +define <4 x float> @convert_u_v4f32_nonneg(<4 x i32> %x) { +; CHECK-LABEL: convert_u_v4f32_nonneg: +; CHECK: .functype convert_u_v4f32_nonneg (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.const 1 +; CHECK-NEXT: i32x4.shr_u +; CHECK-NEXT: f32x4.convert_i32x4_s +; CHECK-NEXT: # 
fallthrough-return + %a = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1> + %b = uitofp <4 x i32> %a to <4 x float> + ret <4 x float> %b +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll index c93b8aa..eb39f90 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll @@ -12,7 +12,7 @@ define <4 x float> @extend_to_float_low_i16x8_u(<8 x i16> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.extend_low_i16x8_u -; CHECK-NEXT: f32x4.convert_i32x4_u +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %extended = uitofp <4 x i16> %low to <4 x float> @@ -25,7 +25,7 @@ define <4 x float> @extend_to_float_high_i16x8_u(<8 x i16> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.extend_high_i16x8_u -; CHECK-NEXT: f32x4.convert_i32x4_u +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %high = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %extended = uitofp <4 x i16> %high to <4 x float> @@ -39,7 +39,7 @@ define <4 x float> @extend_to_float_low_i8x16_u(<8 x i8> %x) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i16x8.extend_low_i8x16_u ; CHECK-NEXT: i32x4.extend_low_i16x8_u -; CHECK-NEXT: f32x4.convert_i32x4_u +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %extended = uitofp <4 x i8> %low to <4 x float> @@ -55,7 +55,7 @@ define <4 x float> @extend_to_float_high_i8x16_u(<8 x i8> %x) { ; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: i16x8.extend_low_i8x16_u ; CHECK-NEXT: i32x4.extend_low_i16x8_u -; CHECK-NEXT: f32x4.convert_i32x4_u +; CHECK-NEXT: f32x4.convert_i32x4_s ; CHECK-NEXT: # fallthrough-return %high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %extended = uitofp <4 x i8> %high to <4 x float> @@ -136,7 +136,7 @@ define <2 x double> @extend_to_double_low_i16x4_u(<4 x i16> %x) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i32x4.extend_low_i16x8_u -; CHECK-NEXT: f64x2.convert_low_i32x4_u +; CHECK-NEXT: f64x2.convert_low_i32x4_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <4 x i16> %x, <4 x i16> undef, <2 x i32> <i32 0, i32 1> %extended = uitofp <2 x i16> %low to <2 x double> diff --git a/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll b/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll index 3b3a460..ab6672e 100644 --- a/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll +++ b/llvm/test/CodeGen/WinEH/wineh-noret-cleanup.ll @@ -1,4 +1,4 @@ -; RUN: sed -e s/.Cxx:// %s | llc -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=CXX,X64CXX +; RUN: sed -e s/.Cxx:// %s | llc -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=CXX ; RUN: sed -e s/.Seh:// %s | llc -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=SEH ; RUN: %if aarch64-registered-target %{ sed -e s/.Cxx:// %s | llc -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefix=CXX %} ; RUN: %if aarch64-registered-target %{ sed -e s/.Seh:// %s | llc -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefix=SEH %} @@ -49,18 +49,14 @@ catch.body.2: ; CXX-NEXT: .[[ENTRY:long|word]] .Lfunc_begin0@IMGREL ; CXX-NEXT: .[[ENTRY]] -1 
; CXX-NEXT: .[[ENTRY]] .Ltmp0@IMGREL -; X64CXX-SAME: +1 ; CXX-NEXT: .[[ENTRY]] 1 ; CXX-NEXT: .[[ENTRY]] .Ltmp1@IMGREL -; X64CXX-SAME: +1 ; CXX-NEXT: .[[ENTRY]] -1 ; CXX-NEXT: .[[ENTRY]] "?catch$3@?0?test@4HA"@IMGREL ; CXX-NEXT: .[[ENTRY]] 2 ; CXX-NEXT: .[[ENTRY]] .Ltmp2@IMGREL -; X64CXX-SAME: +1 ; CXX-NEXT: .[[ENTRY]] 3 ; CXX-NEXT: .[[ENTRY]] .Ltmp3@IMGREL -; X64CXX-SAME: +1 ; CXX-NEXT: .[[ENTRY]] 2 ; CXX-NEXT: .[[ENTRY]] "?catch$5@?0?test@4HA"@IMGREL ; CXX-NEXT: .[[ENTRY]] 4 @@ -70,19 +66,19 @@ catch.body.2: ; SEH: .LBB0_[[CATCH:[0-9]+]]: {{.*}} %catch.body ; SEH-LABEL: .Llsda_begin0: ; SEH-NEXT: .[[ENTRY:long|word]] .Ltmp0@IMGREL -; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL+1 +; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL ; SEH-NEXT: .[[ENTRY]] dummy_filter@IMGREL ; SEH-NEXT: .[[ENTRY]] .LBB0_[[CATCH]]@IMGREL ; SEH-NEXT: .[[ENTRY]] .Ltmp0@IMGREL -; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL+1 +; SEH-NEXT: .[[ENTRY]] .Ltmp1@IMGREL ; SEH-NEXT: .[[ENTRY]] dummy_filter@IMGREL ; SEH-NEXT: .[[ENTRY]] .LBB0_[[CATCH2]]@IMGREL ; SEH-NEXT: .[[ENTRY]] .Ltmp2@IMGREL -; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL+1 +; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL ; SEH-NEXT: .[[ENTRY]] "?dtor$[[DTOR:[0-9]+]]@?0?test@4HA"@IMGREL ; SEH-NEXT: .[[ENTRY]] 0 ; SEH-NEXT: .[[ENTRY]] .Ltmp2@IMGREL -; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL+1 +; SEH-NEXT: .[[ENTRY]] .Ltmp3@IMGREL ; SEH-NEXT: .[[ENTRY]] dummy_filter@IMGREL ; SEH-NEXT: .[[ENTRY]] .LBB0_[[CATCH2]]@IMGREL ; SEH-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll b/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll index 2bd004e..9de79ee 100644 --- a/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll +++ b/llvm/test/CodeGen/WinEH/wineh-reuse-catch-alloca.ll @@ -1,4 +1,5 @@ -; RUN: llc %s --mtriple=x86_64-pc-windows-msvc -o - | FileCheck %s +; RUN: llc %s --mtriple=x86_64-pc-windows-msvc -o - | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: %if aarch64-registered-target %{ llc %s --mtriple=aarch64-pc-windows-msvc -o - | FileCheck %s --check-prefixes=CHECK,ARM64 %} ; Tests the fixed object layouts when two catchpads re-use the same stack ; allocation for their catch objects. @@ -18,27 +19,36 @@ ; } ; ``` -; Minimum stack alloc is 64 bytes, so no change there. ; CHECK-LABEL: calls_boom: -; CHECK: subq $64, %rsp -; CHECK: .seh_stackalloc 64 +; Minimum stack alloc is 64 bytes, so no change there. +; X64: subq $64, %rsp +; X64: .seh_stackalloc 64 +; Only need 48 bytes on the stack, not 64. +; ARM64: sub sp, sp, #48 +; ARM64: .seh_stackalloc 48 ; Both the catch blocks load from the same address. ; CHECK-LABEL: "?catch$3@?0?calls_boom@4HA": -; CHECK: movq -8(%rbp), %rax +; X64: movq -8(%rbp), %rax +; ARM64: ldr x8, [x29, #24] ; CHECK-LABEL: "?catch$4@?0?calls_boom@4HA": -; CHECK: movq -8(%rbp), %rax +; X64: movq -8(%rbp), %rax +; ARM64: ldr x8, [x29, #24] -; There's enough space for the UnwindHelp to be at 48 instead of 40 ; CHECK-LABEL: $cppxdata$calls_boom: -; CHECK: .long 48 # UnwindHelp +; There's enough space for the UnwindHelp to be at 48 instead of 40 +; X64: .long 48 # UnwindHelp +; There's enough space for the UnwindHelp to be at -16 instead of -32 +; ARM64: .word -16 // UnwindHelp ; Both catches have the same object offset. 
; CHECK-LABEL: $handlerMap$0$calls_boom: -; CHECK: .long 56 # CatchObjOffset -; CHECK-NEXT: .long "?catch$3@?0?calls_boom@4HA"@IMGREL # Handler -; CHECK: .long 56 # CatchObjOffset -; CHECK-NEXT: .long "?catch$4@?0?calls_boom@4HA"@IMGREL # Handler +; X64: .long 56 # CatchObjOffset +; ARM64: .word -8 // CatchObjOffset +; CHECK-NEXT: "?catch$3@?0?calls_boom@4HA"@IMGREL +; X64: .long 56 # CatchObjOffset +; ARM64: .word -8 // CatchObjOffset +; CHECK-NEXT: "?catch$4@?0?calls_boom@4HA"@IMGREL %rtti.TypeDescriptor2 = type { ptr, ptr, [3 x i8] } diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index 2911edf..d9064c6 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -1076,15 +1076,15 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: movl %edx, %eax -; X86-NEXT: subl %esi, %eax -; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -1107,15 +1107,15 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: xorl %edx, %esi +; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: movl %edx, %eax -; X86-NEXT: subl %esi, %eax -; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -1142,32 +1142,32 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp -; X86-NEXT: movl 36(%ebp), %eax ; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %edi ; X86-NEXT: movl 28(%ebp), %edx -; X86-NEXT: movl 24(%ebp), %esi -; X86-NEXT: subl 40(%ebp), %esi +; X86-NEXT: subl 40(%ebp), %edi ; X86-NEXT: sbbl 44(%ebp), %edx ; X86-NEXT: sbbl 48(%ebp), %ecx ; X86-NEXT: sbbl 52(%ebp), %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: xorl %edi, %eax -; X86-NEXT: xorl %edi, %ecx -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: xorl %edi, %esi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: subl %esi, %ebx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: xorl %esi, %edx +; X86-NEXT: xorl %esi, %edi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: subl %edi, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl %esi, %edx ; X86-NEXT: sbbl %ecx, %edx -; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: sbbl %eax, %esi ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: 
movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1203,32 +1203,32 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp -; X86-NEXT: movl 36(%ebp), %eax ; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %edi ; X86-NEXT: movl 28(%ebp), %edx -; X86-NEXT: movl 24(%ebp), %esi -; X86-NEXT: subl 40(%ebp), %esi +; X86-NEXT: subl 40(%ebp), %edi ; X86-NEXT: sbbl 44(%ebp), %edx ; X86-NEXT: sbbl 48(%ebp), %ecx ; X86-NEXT: sbbl 52(%ebp), %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: xorl %edi, %eax -; X86-NEXT: xorl %edi, %ecx -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: xorl %edi, %esi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: subl %esi, %ebx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: xorl %esi, %edx +; X86-NEXT: xorl %esi, %edi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: subl %edi, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl %esi, %edx ; X86-NEXT: sbbl %ecx, %edx -; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: sbbl %eax, %esi ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 217cceb..0de308a 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1734,20 +1734,20 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: not_avg_v16i8_wide_constants: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps (%rdi), %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movd %eax, %xmm4 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm5 @@ -1762,6 +1762,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm8 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax +; SSE2-NEXT: movd %eax, %xmm10 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm9 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax @@ -1771,9 +1774,6 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm12 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm13 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax @@ -1783,43 +1783,45 @@ define void 
@not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm15 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE2-NEXT: movapd %xmm4, %xmm5 ; SSE2-NEXT: andpd %xmm1, %xmm5 ; SSE2-NEXT: xorpd %xmm4, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: paddw %xmm5, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; SSE2-NEXT: movapd %xmm0, %xmm3 -; SSE2-NEXT: andpd %xmm2, %xmm3 -; SSE2-NEXT: xorpd %xmm0, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: paddw %xmm3, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE2-NEXT: movapd %xmm2, %xmm3 +; SSE2-NEXT: andpd %xmm0, %xmm3 +; SSE2-NEXT: xorpd %xmm2, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 +; SSE2-NEXT: paddw %xmm3, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm0, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; @@ -1829,74 +1831,75 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-NEXT: vpextrd $2, %xmm5, %ecx -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX1-NEXT: vpextrd $2, %xmm4, %eax -; AVX1-NEXT: vpextrw $3, %xmm3, %edx +; AVX1-NEXT: vpextrw $7, %xmm3, %edx +; AVX1-NEXT: vpextrw $6, %xmm3, %ecx +; AVX1-NEXT: vpextrw $5, %xmm3, %eax ; AVX1-NEXT: decl %edx ; AVX1-NEXT: vmovd %edx, %xmm4 -; AVX1-NEXT: vpextrw $2, %xmm3, %edx -; AVX1-NEXT: decl %edx -; AVX1-NEXT: vmovd %edx, %xmm5 -; AVX1-NEXT: vpextrw $1, %xmm3, %edx -; AVX1-NEXT: decl %edx -; AVX1-NEXT: vmovd %edx, %xmm6 -; AVX1-NEXT: vpextrw $0, %xmm3, %edx +; AVX1-NEXT: vpextrw $4, %xmm3, %edx +; AVX1-NEXT: decl %ecx +; AVX1-NEXT: vmovd %ecx, %xmm5 +; AVX1-NEXT: vpextrw $1, %xmm3, %ecx +; AVX1-NEXT: decl %eax +; AVX1-NEXT: vmovd %eax, %xmm6 +; AVX1-NEXT: vpextrw $0, %xmm3, %eax ; AVX1-NEXT: decl %edx ; AVX1-NEXT: vmovd %edx, %xmm7 -; AVX1-NEXT: vpextrw $3, %xmm2, %edx -; AVX1-NEXT: decl %edx -; AVX1-NEXT: vmovd %edx, %xmm8 -; AVX1-NEXT: vpextrw $2, %xmm2, %edx +; AVX1-NEXT: vpextrw $3, %xmm3, %edx +; AVX1-NEXT: decq %rcx +; AVX1-NEXT: vmovq %rcx, %xmm8 +; AVX1-NEXT: vpextrw $2, %xmm3, %ecx +; AVX1-NEXT: decq %rax +; AVX1-NEXT: vmovq %rax, %xmm3 +; AVX1-NEXT: vpextrw $7, %xmm2, %eax ; AVX1-NEXT: decl %edx ; AVX1-NEXT: vmovd %edx, %xmm9 -; AVX1-NEXT: vpextrw $1, %xmm2, %edx -; AVX1-NEXT: decl %edx -; AVX1-NEXT: vmovd %edx, %xmm10 -; AVX1-NEXT: vpextrw $0, %xmm2, %edx -; AVX1-NEXT: decl %edx -; AVX1-NEXT: vmovd %edx, %xmm11 -; AVX1-NEXT: vpextrw $5, %xmm3, %edx +; AVX1-NEXT: vpextrw $6, %xmm2, %edx +; AVX1-NEXT: decl %ecx +; AVX1-NEXT: vmovd %ecx, %xmm10 +; AVX1-NEXT: vpextrw $5, %xmm2, %ecx +; AVX1-NEXT: decl %eax +; AVX1-NEXT: vmovd %eax, %xmm11 +; AVX1-NEXT: vpextrw $4, %xmm2, %eax ; AVX1-NEXT: decl %edx ; AVX1-NEXT: vmovd %edx, %xmm12 -; AVX1-NEXT: vpextrw $4, %xmm3, %edx -; AVX1-NEXT: decl %edx -; AVX1-NEXT: vmovd %edx, %xmm13 -; AVX1-NEXT: vpextrw $5, %xmm2, %edx -; AVX1-NEXT: decl %edx -; AVX1-NEXT: vmovd %edx, %xmm14 -; AVX1-NEXT: vpextrw $4, %xmm2, %edx -; AVX1-NEXT: decl %edx -; AVX1-NEXT: 
vmovd %edx, %xmm15 -; AVX1-NEXT: vpextrw $7, %xmm3, %edx +; AVX1-NEXT: vpextrw $1, %xmm2, %edx ; AVX1-NEXT: decl %ecx -; AVX1-NEXT: vmovd %ecx, %xmm3 -; AVX1-NEXT: vpextrw $7, %xmm2, %ecx -; AVX1-NEXT: decl %edx -; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vmovd %ecx, %xmm13 +; AVX1-NEXT: vpextrw $0, %xmm2, %ecx +; AVX1-NEXT: decl %eax +; AVX1-NEXT: vmovd %eax, %xmm14 +; AVX1-NEXT: vpextrw $3, %xmm2, %eax +; AVX1-NEXT: decq %rdx +; AVX1-NEXT: vmovq %rdx, %xmm15 +; AVX1-NEXT: vpextrw $2, %xmm2, %edx +; AVX1-NEXT: decq %rcx +; AVX1-NEXT: vmovq %rcx, %xmm2 ; AVX1-NEXT: decl %eax ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX1-NEXT: vmovd %eax, %xmm5 -; AVX1-NEXT: decl %ecx +; AVX1-NEXT: decl %edx ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-NEXT: vmovd %ecx, %xmm7 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-NEXT: vmovddup {{.*#+}} ymm3 = ymm6[0,0,2,2] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-NEXT: vmovd %edx, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: 
vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm1 ; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll index ab9fa22..24d3030 100644 --- a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll +++ b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll @@ -48,6 +48,6 @@ return: ; preds = %catch, %entry ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long 1 ; CHECK-NEXT: .long .LBB0_[[catch]]@IMGREL diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll index c4c194e..7855ff2 100644 --- a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll +++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll @@ -121,7 +121,6 @@ define void @f_non_leaf(i32 %x, i32 %y) !prof !14 { ; WIN64-NEXT: # encoding: [0xeb,A] ; WIN64-NEXT: # fixup A - offset: 1, value: foo, kind: FK_PCRel_1 ; WIN64-NEXT: .LBB1_2: # %bb2 -; WIN64-NEXT: nop # encoding: [0x90] ; WIN64-NEXT: .seh_startepilogue ; WIN64-NEXT: popq %rbx # encoding: [0x5b] ; WIN64-NEXT: .seh_endepilogue diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll index 9c1d830..2859a87 100644 --- a/llvm/test/CodeGen/X86/conditional-tailcall.ll +++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll @@ -121,7 +121,6 @@ define void @f_non_leaf(i32 %x, i32 %y) optsize { ; WIN64-NEXT: # encoding: [0xeb,A] ; WIN64-NEXT: # fixup A - offset: 1, value: foo, kind: FK_PCRel_1 ; WIN64-NEXT: .LBB1_2: # %bb2 -; WIN64-NEXT: nop # encoding: [0x90] ; WIN64-NEXT: .seh_startepilogue ; WIN64-NEXT: popq %rbx # encoding: [0x5b] ; WIN64-NEXT: .seh_endepilogue diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 661e7bb..455b72d 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -172,10 +172,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: movl 52(%ebp), %esi +; X86-NEXT: movl %esi, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %ecx, %esi ; X86-NEXT: xorl %edx, %esi ; X86-NEXT: movl 48(%ebp), %ecx ; X86-NEXT: xorl %edx, %ecx @@ -204,45 +203,45 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %edx +; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: addl $32, %edx -; X86-NEXT: bsrl %esi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %ecx ; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx ; X86-NEXT: testl %esi, %esi -; X86-NEXT: cmovel %edx, %ecx +; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bsrl %edi, %edi ; X86-NEXT: xorl $31, %edi -; 
X86-NEXT: addl $32, %edi +; X86-NEXT: orl $32, %edi ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %edi -; X86-NEXT: addl $64, %edi +; X86-NEXT: orl $64, %edi ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %esi, %edx ; X86-NEXT: cmovnel %ecx, %edi -; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: xorl $31, %edx -; X86-NEXT: addl $32, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: bsrl %eax, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx ; X86-NEXT: testl %eax, %eax -; X86-NEXT: cmovel %edx, %ecx +; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: bsrl %ebx, %esi ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx -; X86-NEXT: addl $32, %edx +; X86-NEXT: orl $32, %edx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: addl $64, %edx +; X86-NEXT: orl $64, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: orl %eax, %esi ; X86-NEXT: cmovnel %ecx, %edx @@ -380,9 +379,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $-1, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 370e1c6..859e924 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -173,17 +173,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl 48(%ebp), %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: addl $32, %ecx +; X86-NEXT: orl $32, %ecx ; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl %ebx, %eax ; X86-NEXT: xorl $31, %eax -; X86-NEXT: addl $32, %eax +; X86-NEXT: orl $32, %eax ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %eax -; X86-NEXT: addl $64, %eax +; X86-NEXT: orl $64, %eax ; X86-NEXT: movl 48(%ebp), %edx ; X86-NEXT: orl %esi, %edx ; X86-NEXT: cmovnel %ecx, %eax @@ -193,7 +193,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl 32(%ebp), %ecx ; X86-NEXT: bsrl %ecx, %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: addl $32, %ecx +; X86-NEXT: orl $32, %ecx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: movl 28(%ebp), %edi @@ -201,10 +201,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl 24(%ebp), %edx ; X86-NEXT: xorl $31, %edx -; 
X86-NEXT: addl $32, %edx +; X86-NEXT: orl $32, %edx ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: addl $64, %edx +; X86-NEXT: orl $64, %edx ; X86-NEXT: movl 32(%ebp), %esi ; X86-NEXT: orl %ebx, %esi ; X86-NEXT: cmovnel %ecx, %edx diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index 0f66d42..953a5e7 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -171,15 +171,15 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: vmovdqa (%ecx), %xmm0 -; X86-NEXT: vpand (%edx), %xmm0, %xmm0 +; X86-NEXT: vmovdqa (%edx), %xmm0 +; X86-NEXT: vpand (%ecx), %xmm0, %xmm0 ; X86-NEXT: vpextrb $6, %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: freeze_extractelement: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rsi), %xmm0 -; X64-NEXT: vpand (%rdi), %xmm0, %xmm0 +; X64-NEXT: vmovdqa (%rdi), %xmm0 +; X64-NEXT: vpand (%rsi), %xmm0, %xmm0 ; X64-NEXT: vpextrb $6, %xmm0, (%rdx) ; X64-NEXT: retq %i0 = load <16 x i8>, ptr %origin0 @@ -198,8 +198,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst, ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: vmovdqa (%edx), %xmm0 -; X86-NEXT: vpand (%esi), %xmm0, %xmm0 +; X86-NEXT: vmovdqa (%esi), %xmm0 +; X86-NEXT: vpand (%edx), %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%ecx) ; X86-NEXT: vpextrb $6, %xmm0, (%eax) ; X86-NEXT: popl %esi @@ -207,8 +207,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst, ; ; X64-LABEL: freeze_extractelement_escape: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rsi), %xmm0 -; X64-NEXT: vpand (%rdi), %xmm0, %xmm0 +; X64-NEXT: vmovdqa (%rdi), %xmm0 +; X64-NEXT: vpand (%rsi), %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%rcx) ; X64-NEXT: vpextrb $6, %xmm0, (%rdx) ; X64-NEXT: retq @@ -239,8 +239,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id ; X86-NEXT: movl 32(%ebp), %edx ; X86-NEXT: movl 12(%ebp), %esi ; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: vmovaps (%esi), %xmm0 -; X86-NEXT: vandps (%edi), %xmm0, %xmm0 +; X86-NEXT: vmovaps (%edi), %xmm0 +; X86-NEXT: vandps (%esi), %xmm0, %xmm0 ; X86-NEXT: vmovaps %xmm0, (%esp) ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: cmpb (%esp,%eax), %cl @@ -255,8 +255,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id ; X64: # %bb.0: ; X64-NEXT: andl $15, %ecx ; X64-NEXT: andl $15, %edx -; X64-NEXT: vmovaps (%rsi), %xmm0 -; X64-NEXT: vandps (%rdi), %xmm0, %xmm0 +; X64-NEXT: vmovaps (%rdi), %xmm0 +; X64-NEXT: vandps (%rsi), %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movzbl -24(%rsp,%rdx), %eax ; X64-NEXT: cmpb -24(%rsp,%rcx), %al diff --git a/llvm/test/CodeGen/X86/noreturn-call-win64.ll b/llvm/test/CodeGen/X86/noreturn-call-win64.ll index 57aa022..13be1f13 100644 --- a/llvm/test/CodeGen/X86/noreturn-call-win64.ll +++ b/llvm/test/CodeGen/X86/noreturn-call-win64.ll @@ -111,3 +111,15 @@ declare dso_local void @"??1MakeCleanup@@QEAA@XZ"(ptr) ; CHECK: # %unreachable ; CHECK: int3 ; CHECK: .seh_handlerdata + + +define dso_local void @last_call_no_return() { + call void @abort1() + unreachable +} + +; CHECK-LABEL: last_call_no_return: +; CHECK: callq abort1 +; CHECK-NEXT: int3 +; CHECK-NEXT: .seh_endproc + diff --git 
a/llvm/test/CodeGen/X86/peephole-copy.mir b/llvm/test/CodeGen/X86/peephole-copy.mir index e24abf84..f399398 100644 --- a/llvm/test/CodeGen/X86/peephole-copy.mir +++ b/llvm/test/CodeGen/X86/peephole-copy.mir @@ -22,14 +22,14 @@ body: | bb.0: ; CHECK-LABEL: name: c ; CHECK: [[MOV32ri:%[0-9]+]]:gr32_abcd = MOV32ri 512 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32 */, [[MOV32ri]], 1 /* reguse */, implicit-def early-clobber $df + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, [[MOV32ri]], 1 /* reguse */, implicit-def early-clobber $df ; CHECK-NEXT: [[MOV32ri1:%[0-9]+]]:gr32_abcd = MOV32ri 512 - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32 */, [[MOV32ri1]], 1 /* reguse */, implicit-def early-clobber $df + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, [[MOV32ri1]], 1 /* reguse */, implicit-def early-clobber $df ; CHECK-NEXT: RET 0 %2 = MOV32ri 512 %0 = COPY %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32_ABCD */, %0:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df + INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, %0:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df %1 = COPY %2 - INLINEASM &"", 1 /* sideeffect attdialect */, 2359305 /* reguse:GR32_ABCD */, %1:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df + INLINEASM &"", 1 /* sideeffect attdialect */, 3211273 /* reguse:GR32_ABCD */, %1:gr32_abcd, 1 /* clobber */, implicit-def early-clobber $df RET 0 ... diff --git a/llvm/test/CodeGen/X86/pr149841.ll b/llvm/test/CodeGen/X86/pr149841.ll new file mode 100644 index 0000000..c17a617 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr149841.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.bar = type { [5 x ptr] } + +@global = external dso_local global %struct.bar + +define i1 @foo(ptr %arg, i1 %arg1) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: cmpq $global+1, %rdi +; CHECK-NEXT: setne %al +; CHECK-NEXT: andb %sil, %al +; CHECK-NEXT: retq +bb: + #dbg_value(ptr @global, !3, !DIExpression(), !5) + %icmp = icmp ne ptr %arg, getelementptr inbounds nuw (i8, ptr @global, i64 1) + %select = select i1 %arg1, i1 %icmp, i1 false + ret i1 %select +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +!1 = !DIFile(filename: "x.c", directory: "/proc/self/cwd") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !1) +!4 = distinct !DISubprogram(name: "x", scope: null, file: !1, spFlags: DISPFlagDefinition, unit: !0) +!5 = !DILocation(line: 0, scope: !4) + diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll index 2d1b7fc..9728e13 100644 --- a/llvm/test/CodeGen/X86/pr62286.ll +++ b/llvm/test/CodeGen/X86/pr62286.ll @@ -42,10 +42,10 @@ define i64 @PR62286(i32 %a) { ; AVX2-LABEL: PR62286: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; 
AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll b/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll new file mode 100644 index 0000000..841061c --- /dev/null +++ b/llvm/test/CodeGen/X86/pseudo-probe-desc-check.ll @@ -0,0 +1,47 @@ +; REQUIRES: asserts +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s -o /dev/null 2>&1 | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-windows-msvc < %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: warning: Guid:8314849053352128226 Name:inlinee does not exist in pseudo probe desc +; CHECK: warning: Guid:6492337042787843907 Name:extract2 does not exist in pseudo probe desc + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + +define void @extract1() !dbg !8 { +entry: + call void @llvm.pseudoprobe(i64 6028998432455395745, i64 1, i32 0, i64 -1), !dbg !11 + call void @llvm.pseudoprobe(i64 8314849053352128226, i64 1, i32 0, i64 -1), !dbg !12 + ret void, !dbg !16 +} + +define void @extract2() !dbg !17 { +entry: + call void @llvm.pseudoprobe(i64 6492337042787843907, i64 1, i32 0, i64 -1), !dbg !18 + ret void, !dbg !18 +} + +declare void @llvm.pseudoprobe(i64, i64, i32, i64) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6} +!llvm.pseudo_probe_desc = !{!7} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: false, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/home/foo") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"uwtable", i32 2} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = !{i64 6028998432455395745, i64 281479271677951, !"extract1"} +!8 = distinct !DISubprogram(name: "extract1", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, spFlags: DISPFlagDefinition, unit: !0) +!9 = !DISubroutineType(types: !10) +!10 = !{} +!11 = !DILocation(line: 5, column: 3, scope: !8) +!12 = !DILocation(line: 2, column: 1, scope: !13, inlinedAt: !14) +!13 = distinct !DISubprogram(name: "inlinee", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0) +!14 = distinct !DILocation(line: 5, column: 3, scope: !15) +!15 = !DILexicalBlockFile(scope: !8, file: !1, discriminator: 455082007) +!16 = !DILocation(line: 6, column: 1, scope: !8) +!17 = distinct !DISubprogram(name: "extract2", scope: !1, file: !1, line: 8, type: !9, scopeLine: 8, spFlags: DISPFlagDefinition, unit: !0) +!18 = !DILocation(line: 9, column: 1, scope: !17) diff --git a/llvm/test/CodeGen/X86/seh-catch-all.ll b/llvm/test/CodeGen/X86/seh-catch-all.ll index 5250bb9..4e25aab 100644 --- a/llvm/test/CodeGen/X86/seh-catch-all.ll +++ b/llvm/test/CodeGen/X86/seh-catch-all.ll @@ -40,7 +40,7 @@ catchall: ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp{{[0-9]+}}@IMGREL -; CHECK-NEXT: .long .Ltmp{{[0-9]+}}@IMGREL+1 +; CHECK-NEXT: .long 
.Ltmp{{[0-9]+}}@IMGREL ; CHECK-NEXT: .long 1 ; CHECK-NEXT: .long .LBB0_2@IMGREL ; CHECK-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll index d958580..cb85f39 100644 --- a/llvm/test/CodeGen/X86/seh-catchpad.ll +++ b/llvm/test/CodeGen/X86/seh-catchpad.ll @@ -123,23 +123,23 @@ __except.ret: ; preds = %catch.dispatch.7 ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long 1 ; CHECK-NEXT: .long .LBB1_[[except1bb]]@IMGREL ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL ; CHECK-NEXT: .long .LBB1_[[except2bb]]@IMGREL ; CHECK-NEXT: .long .Ltmp2@IMGREL -; CHECK-NEXT: .long .Ltmp3@IMGREL+1 +; CHECK-NEXT: .long .Ltmp3@IMGREL ; CHECK-NEXT: .long "?dtor$[[finbb:[0-9]+]]@?0?main@4HA"@IMGREL ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .long .Ltmp2@IMGREL -; CHECK-NEXT: .long .Ltmp3@IMGREL+1 +; CHECK-NEXT: .long .Ltmp3@IMGREL ; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL ; CHECK-NEXT: .long .LBB1_3@IMGREL ; CHECK-NEXT: .long .Ltmp6@IMGREL -; CHECK-NEXT: .long .Ltmp7@IMGREL+1 +; CHECK-NEXT: .long .Ltmp7@IMGREL ; CHECK-NEXT: .long "?filt$0@0@main@@"@IMGREL ; CHECK-NEXT: .long .LBB1_3@IMGREL ; CHECK-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/X86/seh-except-finally.ll b/llvm/test/CodeGen/X86/seh-except-finally.ll index 7f70655..539d776 100644 --- a/llvm/test/CodeGen/X86/seh-except-finally.ll +++ b/llvm/test/CodeGen/X86/seh-except-finally.ll @@ -83,15 +83,15 @@ __try.cont: ; preds = %__except, %invoke.c ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long "?dtor$2@?0?use_both@4HA"@IMGREL ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long "?filt$0@0@use_both@@"@IMGREL ; CHECK-NEXT: .long .LBB0_{{[0-9]+}}@IMGREL ; CHECK-NEXT: .long .Ltmp4@IMGREL -; CHECK-NEXT: .long .Ltmp5@IMGREL+1 +; CHECK-NEXT: .long .Ltmp5@IMGREL ; CHECK-NEXT: .long "?filt$0@0@use_both@@"@IMGREL ; CHECK-NEXT: .long .LBB0_{{[0-9]+}}@IMGREL ; CHECK-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/X86/seh-finally.ll b/llvm/test/CodeGen/X86/seh-finally.ll index 41823df..6093e5e 100644 --- a/llvm/test/CodeGen/X86/seh-finally.ll +++ b/llvm/test/CodeGen/X86/seh-finally.ll @@ -30,7 +30,7 @@ lpad: ; preds = %entry ; X64-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 # Number of call sites ; X64-NEXT: .Llsda_begin0: ; X64-NEXT: .long .Ltmp0@IMGREL # LabelStart -; X64-NEXT: .long .Ltmp1@IMGREL+1 # LabelEnd +; X64-NEXT: .long .Ltmp1@IMGREL # LabelEnd ; X64-NEXT: .long "?dtor$2@?0?main@4HA"@IMGREL # FinallyFunclet ; X64-NEXT: .long 0 # Null ; X64-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/X86/seh-safe-div.ll b/llvm/test/CodeGen/X86/seh-safe-div.ll index 542d9f6..20169f8 100644 --- a/llvm/test/CodeGen/X86/seh-safe-div.ll +++ b/llvm/test/CodeGen/X86/seh-safe-div.ll @@ -60,6 +60,7 @@ __try.cont: ; CHECK: .Ltmp0: ; CHECK: leaq [[rloc:.*\(%rbp\)]], %rcx ; CHECK: callq try_body +; CHECK: nop ; CHECK-NEXT: .Ltmp1 ; CHECK: [[cont_bb:\.LBB0_[0-9]+]]: ; CHECK: movl [[rloc]], %eax @@ -82,11 +83,11 @@ __try.cont: ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: 
.long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long safe_div_filt0@IMGREL ; CHECK-NEXT: .long [[handler0]]@IMGREL ; CHECK-NEXT: .long .Ltmp0@IMGREL -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 +; CHECK-NEXT: .long .Ltmp1@IMGREL ; CHECK-NEXT: .long safe_div_filt1@IMGREL ; CHECK-NEXT: .long [[handler1]]@IMGREL ; CHECK-NEXT: .Llsda_end0: diff --git a/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll b/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll index 2c576df..5a6aeb6 100644 --- a/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll +++ b/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll @@ -56,8 +56,8 @@ declare dso_local void @printf(ptr, ...) ; CHECK-NEXT:$ip2state$test: ; CHECK-NEXT: .long .Lfunc_begin0@IMGREL # IP ; CHECK-NEXT: .long -1 # ToState -; CHECK-NEXT: .long .Ltmp0@IMGREL+1 # IP +; CHECK-NEXT: .long .Ltmp0@IMGREL # IP ; CHECK-NEXT: .long 0 # ToState -; CHECK-NEXT: .long .Ltmp1@IMGREL+1 # IP +; CHECK-NEXT: .long .Ltmp1@IMGREL # IP ; CHECK-NEXT: .long -1 # ToState diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll index d273d09..c7cf9cb 100644 --- a/llvm/test/CodeGen/X86/select-optimize.ll +++ b/llvm/test/CodeGen/X86/select-optimize.ll @@ -229,9 +229,10 @@ define i32 @expensive_val_operand4(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { } ; Expensive cold value operand with unsafe-to-sink (due to lifetime-end marker) load (partial slice sinking). -define i32 @expensive_val_operand5(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { +define i32 @expensive_val_operand5(i32 %b, i32 %y, i1 %cmp) { ; CHECK-LABEL: @expensive_val_operand5( -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 8 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[A]]) ; CHECK-NEXT: [[CMP_FROZEN:%.*]] = freeze i1 [[CMP:%.*]] ; CHECK-NEXT: br i1 [[CMP_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]] @@ -242,6 +243,7 @@ define i32 @expensive_val_operand5(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) { ; CHECK-NEXT: [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ] ; CHECK-NEXT: ret i32 [[SEL]] ; + %a = alloca i32 %load = load i32, ptr %a, align 8 call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %a) %x = add i32 %load, %b diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll index d2b292f..2ac2be5 100644 --- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -119,8 +119,8 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body ; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1 ; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %ymm5 -; CHECK-AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5 +; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi ; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8 ; CHECK-AVX2-NEXT: vmovq %xmm5, %r9 diff --git a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll index e2de2ff..74fe07e 100644 --- a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll +++ b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll @@ -84,12 +84,12 @@ define void @pr66984(ptr %arg) personality ptr 
@__CxxFrameHandler3 { ; X86_64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X86_64-NEXT: .Ltmp0: ; X86_64-NEXT: callq throw +; X86_64-NEXT: nop ; X86_64-NEXT: .Ltmp1: ; X86_64-NEXT: # %bb.1: # %bb14 ; X86_64-NEXT: .LBB0_3: # Block address taken ; X86_64-NEXT: # %exit ; X86_64-NEXT: $ehgcr_0_3: -; X86_64-NEXT: nop ; X86_64-NEXT: .seh_startepilogue ; X86_64-NEXT: addq $64, %rsp ; X86_64-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/swap.ll b/llvm/test/CodeGen/X86/swap.ll index e556900..1dc454dd 100644 --- a/llvm/test/CodeGen/X86/swap.ll +++ b/llvm/test/CodeGen/X86/swap.ll @@ -47,12 +47,10 @@ define dso_local void @onealloc_noreadback(ptr nocapture %a, ptr nocapture %b) l entry: %alloc = alloca [16 x i8], i8 2, align 1 %part2 = getelementptr inbounds [16 x i8], ptr %alloc, i64 1, i64 0 - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %alloc) - call void @llvm.lifetime.start.p0(i64 16, ptr nonnull %part2) + call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %alloc) call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %alloc, ptr align 1 %a, i64 16, i1 false) tail call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 1 %part2, ptr align 1 %b, i64 16, i1 false) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %alloc) - call void @llvm.lifetime.end.p0(i64 16, ptr nonnull %part2) + call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %alloc) ret void } @@ -115,8 +113,9 @@ define dso_local void @onealloc_readback_1(ptr nocapture %a, ptr nocapture %b) l ; ; AA-LABEL: onealloc_readback_1: ; AA: # %bb.0: # %entry -; AA-NEXT: vmovups (%rsi), %xmm0 +; AA-NEXT: vmovups (%rdi), %xmm0 ; AA-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AA-NEXT: vmovups (%rsi), %xmm0 ; AA-NEXT: vmovups %xmm0, (%rdi) ; AA-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/taildup-heapallocsite.ll b/llvm/test/CodeGen/X86/taildup-heapallocsite.ll index 967e125..f3bef47 100644 --- a/llvm/test/CodeGen/X86/taildup-heapallocsite.ll +++ b/llvm/test/CodeGen/X86/taildup-heapallocsite.ll @@ -37,9 +37,11 @@ cond.end: ; preds = %entry, %cond.true ; CHECK: testq ; CHECK: je ; CHECK: callq alloc +; CHECK-NEXT: nop ; CHECK-NEXT: [[L1:.Ltmp[0-9]+]] ; CHECK: jmp f2 # TAILCALL ; CHECK: callq alloc +; CHECK-NEXT: nop ; CHECK-NEXT: [[L3:.Ltmp[0-9]+]] ; CHECK: jmp f2 # TAILCALL diff --git a/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll b/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll index bfb9c43..0bf8370 100644 --- a/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll +++ b/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll @@ -103,15 +103,15 @@ handler2: ; X64: $ip2state$try_in_catch: ; X64-NEXT: .long .Lfunc_begin0@IMGREL ; X64-NEXT: .long -1 -; X64-NEXT: .long .Ltmp0@IMGREL+1 +; X64-NEXT: .long .Ltmp0@IMGREL ; X64-NEXT: .long 0 -; X64-NEXT: .long .Ltmp1@IMGREL+1 +; X64-NEXT: .long .Ltmp1@IMGREL ; X64-NEXT: .long -1 ; X64-NEXT: .long "?catch$2@?0?try_in_catch@4HA"@IMGREL ; X64-NEXT: .long 1 -; X64-NEXT: .long .Ltmp2@IMGREL+1 +; X64-NEXT: .long .Ltmp2@IMGREL ; X64-NEXT: .long 2 -; X64-NEXT: .long .Ltmp3@IMGREL+1 +; X64-NEXT: .long .Ltmp3@IMGREL ; X64-NEXT: .long 1 ; X64-NEXT: .long "?catch$4@?0?try_in_catch@4HA"@IMGREL ; X64-NEXT: .long 3 diff --git a/llvm/test/CodeGen/X86/win-catchpad.ll b/llvm/test/CodeGen/X86/win-catchpad.ll index 2491946..62ea510 100644 --- a/llvm/test/CodeGen/X86/win-catchpad.ll +++ b/llvm/test/CodeGen/X86/win-catchpad.ll @@ -214,9 +214,9 @@ try.cont: ; X64: $ip2state$try_catch_catch: ; X64-NEXT: .long .Lfunc_begin0@IMGREL ; X64-NEXT: .long -1 -; X64-NEXT: .long .Ltmp0@IMGREL+1 +; X64-NEXT: 
.long .Ltmp0@IMGREL ; X64-NEXT: .long 0 -; X64-NEXT: .long .Ltmp1@IMGREL+1 +; X64-NEXT: .long .Ltmp1@IMGREL ; X64-NEXT: .long -1 ; X64-NEXT: .long "?catch$[[catch1bb]]@?0?try_catch_catch@4HA"@IMGREL ; X64-NEXT: .long 1 @@ -357,9 +357,9 @@ try.cont: ; X64-LABEL: $ip2state$branch_to_normal_dest: ; X64-NEXT: .long .Lfunc_begin1@IMGREL ; X64-NEXT: .long -1 -; X64-NEXT: .long .Ltmp[[before_call]]@IMGREL+1 +; X64-NEXT: .long .Ltmp[[before_call]]@IMGREL ; X64-NEXT: .long 0 -; X64-NEXT: .long .Ltmp[[after_call]]@IMGREL+1 +; X64-NEXT: .long .Ltmp[[after_call]]@IMGREL ; X64-NEXT: .long -1 ; X64-NEXT: .long "?catch$[[catchbb]]@?0?branch_to_normal_dest@4HA"@IMGREL ; X64-NEXT: .long 1 diff --git a/llvm/test/CodeGen/X86/win-cleanuppad.ll b/llvm/test/CodeGen/X86/win-cleanuppad.ll index e3f7f5b..e9265a1 100644 --- a/llvm/test/CodeGen/X86/win-cleanuppad.ll +++ b/llvm/test/CodeGen/X86/win-cleanuppad.ll @@ -191,7 +191,7 @@ cleanup.outer: ; preds = %invoke.cont.1, %c ; X64-NEXT: .long 1 ; X64-NEXT: .long .Ltmp6@IMGREL ; X64-NEXT: .long 0 -; X64-NEXT: .long .Ltmp7@IMGREL+1 +; X64-NEXT: .long .Ltmp7@IMGREL ; X64-NEXT: .long -1 attributes #0 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/X86/win32-eh-states.ll b/llvm/test/CodeGen/X86/win32-eh-states.ll index 42ae5b0..e645199 100644 --- a/llvm/test/CodeGen/X86/win32-eh-states.ll +++ b/llvm/test/CodeGen/X86/win32-eh-states.ll @@ -86,11 +86,11 @@ catch.7: ; X64-LABEL: $ip2state$f: ; X64-NEXT: .long .Lfunc_begin0@IMGREL ; X64-NEXT: .long -1 -; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1 +; X64-NEXT: .long .Ltmp{{.*}}@IMGREL ; X64-NEXT: .long 0 -; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1 +; X64-NEXT: .long .Ltmp{{.*}}@IMGREL ; X64-NEXT: .long 1 -; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1 +; X64-NEXT: .long .Ltmp{{.*}}@IMGREL ; X64-NEXT: .long -1 ; X64-NEXT: .long "?catch${{.*}}@?0?f@4HA"@IMGREL ; X64-NEXT: .long 2 @@ -189,15 +189,15 @@ unreachable: ; preds = %entry ; X64-LABEL: $ip2state$g: ; X64-NEXT: .long .Lfunc_begin1@IMGREL ; X64-NEXT: .long -1 -; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1 +; X64-NEXT: .long .Ltmp{{.*}}@IMGREL ; X64-NEXT: .long 1 -; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1 +; X64-NEXT: .long .Ltmp{{.*}}@IMGREL ; X64-NEXT: .long -1 ; X64-NEXT: .long "?catch${{.*}}@?0?g@4HA"@IMGREL ; X64-NEXT: .long 2 -; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1 +; X64-NEXT: .long .Ltmp{{.*}}@IMGREL ; X64-NEXT: .long 3 -; X64-NEXT: .long .Ltmp{{.*}}@IMGREL+1 +; X64-NEXT: .long .Ltmp{{.*}}@IMGREL ; X64-NEXT: .long 2 diff --git a/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll b/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll index bc5be7a..75f156f 100644 --- a/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll +++ b/llvm/test/CodeGen/X86/win64-seh-epilogue-statepoint.ll @@ -8,8 +8,8 @@ define i32 @foobar() gc "statepoint-example" personality ptr @__gxx_personality_ ; CHECK-NEXT: .seh_stackalloc 40 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: callq bar -; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: nop +; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: .seh_endepilogue diff --git a/llvm/test/CodeGen/X86/wineh-coreclr.ll b/llvm/test/CodeGen/X86/wineh-coreclr.ll index baf5eaa..a3d0fde 100644 --- a/llvm/test/CodeGen/X86/wineh-coreclr.ll +++ b/llvm/test/CodeGen/X86/wineh-coreclr.ll @@ -38,6 +38,7 @@ entry: ; CHECK: 
[[test1_before_f1:.+]]: ; CHECK-NEXT: movl $1, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test1_after_f1:.+]]: invoke void @f(i32 1) to label %inner_try unwind label %finally @@ -46,6 +47,7 @@ inner_try: ; CHECK: [[test1_before_f2:.+]]: ; CHECK-NEXT: movl $2, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test1_after_f2:.+]]: invoke void @f(i32 2) to label %finally.clone unwind label %exn.dispatch @@ -69,6 +71,7 @@ catch1: ; CHECK: [[test1_before_f3:.+]]: ; CHECK-NEXT: movl $3, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test1_after_f3:.+]]: invoke void @f(i32 3) [ "funclet"(token %catch.pad1) ] to label %catch1.ret unwind label %finally @@ -92,6 +95,7 @@ catch2: ; CHECK: [[test1_before_f4:.+]]: ; CHECK-NEXT: movl $4, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test1_after_f4:.+]]: invoke void @f(i32 4) [ "funclet"(token %catch.pad2) ] to label %try_in_catch unwind label %finally @@ -100,6 +104,7 @@ try_in_catch: ; CHECK: [[test1_before_f5:.+]]: ; CHECK-NEXT: movl $5, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test1_after_f5:.+]]: invoke void @f(i32 5) [ "funclet"(token %catch.pad2) ] to label %catch2.ret unwind label %fault @@ -116,6 +121,7 @@ fault: ; CHECK: [[test1_before_f6:.+]]: ; CHECK-NEXT: movl $6, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test1_after_f6:.+]]: invoke void @f(i32 6) [ "funclet"(token %fault.pad) ] to label %fault.ret unwind label %finally @@ -312,6 +318,7 @@ unreachable: ; CHECK: [[test2_before_f1:.+]]: ; CHECK-NEXT: movl $1, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test2_after_f1:.+]]: ; CHECK: .seh_proc [[test2_catch1:[^ ]+]] ; CHECK: .seh_proc [[test2_catch2:[^ ]+]] @@ -320,6 +327,7 @@ unreachable: ; CHECK: [[test2_before_f2:.+]]: ; CHECK-NEXT: movl $2, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test2_after_f2:.+]]: ; CHECK: int3 ; CHECK: [[test2_end:.*func_end.*]]: @@ -448,6 +456,7 @@ entry: ; CHECK: [[test3_before_f1:.+]]: ; CHECK-NEXT: movl $1, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test3_after_f1:.+]]: invoke void @f(i32 1) to label %exit unwind label %fault1 @@ -474,6 +483,7 @@ fault4: ; CHECK: [[test3_before_f6:.+]]: ; CHECK-NEXT: movl $6, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test3_after_f6:.+]]: invoke void @f(i32 6) ["funclet"(token %fault.pad4)] to label %fault4.cont unwind label %exn.dispatch1 @@ -482,6 +492,7 @@ fault4.cont: ; CHECK: [[test3_before_f7:.+]]: ; CHECK-NEXT: movl $7, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test3_after_f7:.+]]: invoke void @f(i32 7) ["funclet"(token %fault.pad4)] to label %unreachable unwind label %fault5 @@ -512,6 +523,7 @@ unreachable: ; CHECK: [[test3_before_f4:.+]]: ; CHECK-NEXT: movl $4, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test3_after_f4:.+]]: ; CHECK: int3 ; CHECK: .seh_proc [[test3_fault2:[^ ]+]] @@ -520,6 +532,7 @@ unreachable: ; CHECK: [[test3_before_f3:.+]]: ; CHECK-NEXT: movl $3, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test3_after_f3:.+]]: ; CHECK: int3 ; CHECK: .seh_proc [[test3_fault1:[^ ]+]] @@ -528,6 +541,7 @@ unreachable: ; CHECK: [[test3_before_f2:.+]]: ; CHECK-NEXT: movl $2, %ecx ; CHECK-NEXT: callq f +; CHECK-NEXT: nop ; CHECK-NEXT: [[test3_after_f2:.+]]: ; CHECK: int3 ; CHECK: [[test3_end:.*func_end.*]]: diff --git a/llvm/test/CodeGen/XCore/exception.ll b/llvm/test/CodeGen/XCore/exception.ll index f222297..bb5f3f4 100644 --- 
a/llvm/test/CodeGen/XCore/exception.ll +++ b/llvm/test/CodeGen/XCore/exception.ll @@ -60,7 +60,7 @@ entry: ; CHECK: [[PRE_G:.L[a-zA-Z0-9_]+]] ; CHECK: bl g ; CHECK: [[POST_G:.L[a-zA-Z0-9_]+]] -; CHECK: [[RETURN:.L[a-zA-Z0-9_]+]] +; CHECK: [[RETURN:^.L[a-zA-Z0-9_]+]] ; CHECK: ldw r6, sp[1] ; CHECK: ldw r5, sp[2] ; CHECK: ldw r4, sp[3]