diff options
Diffstat (limited to 'llvm/test/CodeGen')
105 files changed, 2445 insertions, 3377 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll index 4932529..3007e7c 100644 --- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll +++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -8,9 +8,8 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %tmp1 = load <4 x double>, ptr %ptr %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16> @@ -26,13 +25,10 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) { ; CHECK-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-NEXT: fcvtzs v3.2d, v3.2d ; CHECK-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: xtn v3.2s, v3.2d -; CHECK-NEXT: xtn v2.2s, v2.2d -; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h -; CHECK-NEXT: uzp1 v1.4h, v2.4h, v3.4h -; CHECK-NEXT: uzp1 v0.8b, v1.8b, v0.8b +; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret %tmp1 = load <8 x double>, ptr %ptr %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8> @@ -96,9 +92,8 @@ define <4 x i16> @fptoui_v4f64_to_v4i16(ptr %ptr) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: xtn v1.2s, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %tmp1 = load <4 x double>, ptr %ptr %tmp2 = fptoui <4 x double> %tmp1 to <4 x i16> diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index bccfdb9..9ebd570 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -59,7 +59,7 @@ define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){ ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret @@ -388,7 +388,7 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-NEXT: add x8, sp, #12 -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: str s0, [sp, #12] ; CHECK-NEXT: ld1 { v0.h }[0], [x8] ; CHECK-NEXT: orr x8, x8, #0x2 diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index 1f68c77..dff4831 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -650,7 +650,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: add x11, x3, #12 ; CHECK-NEXT: str s1, [x4] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ldp s0, s5, [x2] +; CHECK-NEXT: ldp s0, s4, [x2] ; CHECK-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-NEXT: umov w9, v2.h[0] ; CHECK-NEXT: umov w10, v2.h[1] @@ -662,24 +662,25 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-NEXT: mov v0.b[10], w9 ; CHECK-NEXT: add x9, x1, #4 -; CHECK-NEXT: uzp1 v1.8b, v1.8b, v2.8b +; CHECK-NEXT: mov v1.d[1], v2.d[0] ; CHECK-NEXT: mov v0.b[11], w10 ; CHECK-NEXT: add x10, x1, #12 +; CHECK-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4 -; CHECK-NEXT: ldr s4, [x0, #12] -; CHECK-NEXT: ldp s3, s16, [x0, #4] -; CHECK-NEXT: ld1 { v5.s }[1], [x3] -; CHECK-NEXT: ldp s6, s7, [x2, #8] -; CHECK-NEXT: ld1 { v4.s }[1], [x10] -; CHECK-NEXT: ld1 { v3.s }[1], [x9] -; CHECK-NEXT: ld1 { v6.s }[1], [x8] -; CHECK-NEXT: ld1 { v7.s }[1], [x11] +; CHECK-NEXT: ldr s3, [x0, #12] +; CHECK-NEXT: ldp s2, s7, [x0, #4] +; CHECK-NEXT: ld1 { v4.s }[1], [x3] +; CHECK-NEXT: ldp s5, s6, [x2, #8] +; CHECK-NEXT: ld1 { v3.s }[1], [x10] +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: ld1 { v5.s }[1], [x8] +; CHECK-NEXT: ld1 { v6.s }[1], [x11] ; CHECK-NEXT: add x8, x1, #8 -; CHECK-NEXT: ld1 { v16.s }[1], [x8] -; CHECK-NEXT: uaddl v2.8h, v3.8b, v4.8b -; CHECK-NEXT: ushll v3.8h, v6.8b, #0 -; CHECK-NEXT: uaddl v4.8h, v5.8b, v7.8b -; CHECK-NEXT: uaddl v1.8h, v1.8b, v16.8b +; CHECK-NEXT: ld1 { v7.s }[1], [x8] +; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: ushll v3.8h, v5.8b, #0 +; CHECK-NEXT: uaddl v4.8h, v4.8b, v6.8b +; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b ; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b ; CHECK-NEXT: ushll v0.4s, v2.4h, #3 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 diff --git a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll index 897d35a..8de0f0d 100644 --- a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll +++ b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll @@ -131,7 +131,7 @@ define i32 @f7() { ; GISEL-NEXT: ret entry: - %lshr = lshr i128 bitcast (<2 x i64> <i64 undef, i64 ptrtoint (ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @x3, i64 0, inrange i32 1, i64 2) to i64)> to i128), 64 + %lshr = lshr i128 bitcast (<2 x i64> <i64 undef, i64 ptrtoint (ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @x3, i64 0, i32 1, i64 2) to i64)> to i128), 64 %trunc = trunc i128 %lshr to i64 %inttoptr = inttoptr i64 %trunc to ptr %gep = getelementptr i32, ptr %inttoptr, i64 5 diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll index 1ea87bb..0a3b9a0 100644 --- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll @@ -73,9 +73,8 @@ define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) { ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: fcvtzs.4s v1, v1 ; CHECK-NEXT: fcvtzs.4s v0, v0 -; CHECK-NEXT: xtn.4h v1, v1 -; CHECK-NEXT: xtn.4h v0, v0 -; CHECK-NEXT: uzp1.8b v0, v0, v1 +; CHECK-NEXT: uzp1.8h v0, v0, v1 +; CHECK-NEXT: xtn.8b v0, v0 ; CHECK-NEXT: str d0, [x1] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll index 67190e8..7af01b5 100644 --- a/llvm/test/CodeGen/AArch64/fptoi.ll +++ b/llvm/test/CodeGen/AArch64/fptoi.ll @@ -1096,30 +1096,17 @@ entry: } define <3 x i16> @fptos_v3f64_v3i16(<3 x double> %a) { -; CHECK-SD-LABEL: fptos_v3f64_v3i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d -; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fptos_v3f64_v3i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] -; CHECK-GI-NEXT: fcvtzs v1.2d, v2.2d -; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fptos_v3f64_v3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-NEXT: fcvtzs v0.2d, v0.2d +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret entry: %c = fptosi <3 x double> %a to <3 x i16> ret <3 x i16> %c @@ -1134,9 +1121,8 @@ define <3 x i16> @fptou_v3f64_v3i16(<3 x double> %a) { ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptou_v3f64_v3i16: @@ -1160,9 +1146,8 @@ define <4 x i16> @fptos_v4f64_v4i16(<4 x double> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptos_v4f64_v4i16: @@ -1182,9 +1167,8 @@ define <4 x i16> @fptou_v4f64_v4i16(<4 x double> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptou_v4f64_v4i16: @@ -1600,9 +1584,8 @@ define <3 x i8> @fptos_v3f64_v3i8(<3 x double> %a) { ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s ; CHECK-SD-NEXT: umov w0, v0.h[0] ; CHECK-SD-NEXT: umov w1, v0.h[1] ; CHECK-SD-NEXT: umov w2, v0.h[2] @@ -1638,9 +1621,8 @@ define <3 x i8> @fptou_v3f64_v3i8(<3 x double> %a) { ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] ; CHECK-SD-NEXT: fcvtzs v1.2d, v2.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s ; CHECK-SD-NEXT: umov w0, v0.h[0] ; CHECK-SD-NEXT: umov w1, v0.h[1] ; CHECK-SD-NEXT: umov w2, v0.h[2] @@ -1672,9 +1654,8 @@ define <4 x i8> @fptos_v4f64_v4i8(<4 x double> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptos_v4f64_v4i8: @@ -1694,9 +1675,8 @@ define <4 x i8> @fptou_v4f64_v4i8(<4 x double> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: xtn v0.4h, v0.4s ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptou_v4f64_v4i8: @@ -1718,13 +1698,10 @@ define <8 x i8> @fptos_v8f64_v8i8(<8 x double> %a) { ; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d ; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v3.2s, v3.2d -; CHECK-SD-NEXT: xtn v2.2s, v2.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v2.8b +; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: xtn v0.8b, v0.8h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptos_v8f64_v8i8: @@ -1750,13 +1727,10 @@ define <8 x i8> @fptou_v8f64_v8i8(<8 x double> %a) { ; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d ; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v3.2s, v3.2d -; CHECK-SD-NEXT: xtn v2.2s, v2.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v2.8b +; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: xtn v0.8b, v0.8h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptou_v8f64_v8i8: @@ -1786,21 +1760,13 @@ define <16 x i8> @fptos_v16f64_v16i8(<16 x double> %a) { ; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d ; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v7.2s, v7.2d -; CHECK-SD-NEXT: xtn v6.2s, v6.2d -; CHECK-SD-NEXT: xtn v5.2s, v5.2d -; CHECK-SD-NEXT: xtn v4.2s, v4.2d -; CHECK-SD-NEXT: xtn v3.2s, v3.2d -; CHECK-SD-NEXT: xtn v2.2s, v2.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h -; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h -; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: mov v4.d[1], v6.d[0] -; CHECK-SD-NEXT: mov v0.d[1], v2.d[0] -; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b +; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s +; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: uzp1 v1.8h, v4.8h, v6.8h +; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptos_v16f64_v16i8: @@ -1837,21 +1803,13 @@ define <16 x i8> @fptou_v16f64_v16i8(<16 x double> %a) { ; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d ; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d ; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-SD-NEXT: xtn v7.2s, v7.2d -; CHECK-SD-NEXT: xtn v6.2s, v6.2d -; CHECK-SD-NEXT: xtn v5.2s, v5.2d -; CHECK-SD-NEXT: xtn v4.2s, v4.2d -; CHECK-SD-NEXT: xtn v3.2s, v3.2d -; CHECK-SD-NEXT: xtn v2.2s, v2.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h -; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h -; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: mov v4.d[1], v6.d[0] -; CHECK-SD-NEXT: mov v0.d[1], v2.d[0] -; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b +; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s +; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: uzp1 v1.8h, v4.8h, v6.8h +; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptou_v16f64_v16i8: @@ -1900,36 +1858,20 @@ define <32 x i8> @fptos_v32f64_v32i8(<32 x double> %a) { ; CHECK-SD-NEXT: fcvtzs v18.2d, v18.2d ; CHECK-SD-NEXT: fcvtzs v17.2d, v17.2d ; CHECK-SD-NEXT: fcvtzs v16.2d, v16.2d -; CHECK-SD-NEXT: xtn v7.2s, v7.2d -; CHECK-SD-NEXT: xtn v6.2s, v6.2d -; CHECK-SD-NEXT: xtn v5.2s, v5.2d -; CHECK-SD-NEXT: xtn v4.2s, v4.2d -; CHECK-SD-NEXT: xtn v3.2s, v3.2d -; CHECK-SD-NEXT: xtn v2.2s, v2.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: xtn v23.2s, v23.2d -; CHECK-SD-NEXT: xtn v22.2s, v22.2d -; CHECK-SD-NEXT: xtn v21.2s, v21.2d -; CHECK-SD-NEXT: xtn v20.2s, v20.2d -; CHECK-SD-NEXT: xtn v19.2s, v19.2d -; CHECK-SD-NEXT: xtn v18.2s, v18.2d -; CHECK-SD-NEXT: xtn v17.2s, v17.2d -; CHECK-SD-NEXT: xtn v16.2s, v16.2d -; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h -; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h -; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: uzp1 v1.4h, v22.4h, v23.4h -; CHECK-SD-NEXT: uzp1 v3.4h, v20.4h, v21.4h -; CHECK-SD-NEXT: uzp1 v5.4h, v18.4h, v19.4h -; CHECK-SD-NEXT: uzp1 v7.4h, v16.4h, v17.4h -; CHECK-SD-NEXT: mov v4.d[1], v6.d[0] -; CHECK-SD-NEXT: mov v0.d[1], v2.d[0] -; CHECK-SD-NEXT: mov v3.d[1], v1.d[0] -; CHECK-SD-NEXT: mov v7.d[1], v5.d[0] +; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s +; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: uzp1 v3.4s, v20.4s, v21.4s +; CHECK-SD-NEXT: uzp1 v1.4s, v22.4s, v23.4s +; CHECK-SD-NEXT: uzp1 v5.4s, v18.4s, v19.4s +; CHECK-SD-NEXT: uzp1 v7.4s, v16.4s, v17.4s +; CHECK-SD-NEXT: uzp1 v4.8h, v4.8h, v6.8h +; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: uzp1 v1.8h, v3.8h, v1.8h +; CHECK-SD-NEXT: uzp1 v2.8h, v7.8h, v5.8h ; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b -; CHECK-SD-NEXT: uzp1 v1.16b, v7.16b, v3.16b +; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptos_v32f64_v32i8: @@ -1997,36 +1939,20 @@ define <32 x i8> @fptou_v32f64_v32i8(<32 x double> %a) { ; CHECK-SD-NEXT: fcvtzs v18.2d, v18.2d ; CHECK-SD-NEXT: fcvtzs v17.2d, v17.2d ; CHECK-SD-NEXT: fcvtzs v16.2d, v16.2d -; CHECK-SD-NEXT: xtn v7.2s, v7.2d -; CHECK-SD-NEXT: xtn v6.2s, v6.2d -; CHECK-SD-NEXT: xtn v5.2s, v5.2d -; CHECK-SD-NEXT: xtn v4.2s, v4.2d -; CHECK-SD-NEXT: xtn v3.2s, v3.2d -; CHECK-SD-NEXT: xtn v2.2s, v2.2d -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: xtn v0.2s, v0.2d -; CHECK-SD-NEXT: xtn v23.2s, v23.2d -; CHECK-SD-NEXT: xtn v22.2s, v22.2d -; CHECK-SD-NEXT: xtn v21.2s, v21.2d -; CHECK-SD-NEXT: xtn v20.2s, v20.2d -; CHECK-SD-NEXT: xtn v19.2s, v19.2d -; CHECK-SD-NEXT: xtn v18.2s, v18.2d -; CHECK-SD-NEXT: xtn v17.2s, v17.2d -; CHECK-SD-NEXT: xtn v16.2s, v16.2d -; CHECK-SD-NEXT: uzp1 v6.4h, v6.4h, v7.4h -; CHECK-SD-NEXT: uzp1 v4.4h, v4.4h, v5.4h -; CHECK-SD-NEXT: uzp1 v2.4h, v2.4h, v3.4h -; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: uzp1 v1.4h, v22.4h, v23.4h -; CHECK-SD-NEXT: uzp1 v3.4h, v20.4h, v21.4h -; CHECK-SD-NEXT: uzp1 v5.4h, v18.4h, v19.4h -; CHECK-SD-NEXT: uzp1 v7.4h, v16.4h, v17.4h -; CHECK-SD-NEXT: mov v4.d[1], v6.d[0] -; CHECK-SD-NEXT: mov v0.d[1], v2.d[0] -; CHECK-SD-NEXT: mov v3.d[1], v1.d[0] -; CHECK-SD-NEXT: mov v7.d[1], v5.d[0] +; CHECK-SD-NEXT: uzp1 v6.4s, v6.4s, v7.4s +; CHECK-SD-NEXT: uzp1 v4.4s, v4.4s, v5.4s +; CHECK-SD-NEXT: uzp1 v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: uzp1 v3.4s, v20.4s, v21.4s +; CHECK-SD-NEXT: uzp1 v1.4s, v22.4s, v23.4s +; CHECK-SD-NEXT: uzp1 v5.4s, v18.4s, v19.4s +; CHECK-SD-NEXT: uzp1 v7.4s, v16.4s, v17.4s +; CHECK-SD-NEXT: uzp1 v4.8h, v4.8h, v6.8h +; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: uzp1 v1.8h, v3.8h, v1.8h +; CHECK-SD-NEXT: uzp1 v2.8h, v7.8h, v5.8h ; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v4.16b -; CHECK-SD-NEXT: uzp1 v1.16b, v7.16b, v3.16b +; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptou_v32f64_v32i8: @@ -3026,9 +2952,8 @@ define <8 x i8> @fptos_v8f32_v8i8(<8 x float> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fcvtzs v1.4s, v1.4s ; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-SD-NEXT: xtn v1.4h, v1.4s -; CHECK-SD-NEXT: xtn v0.4h, v0.4s -; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: xtn v0.8b, v0.8h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptos_v8f32_v8i8: @@ -3048,9 +2973,8 @@ define <8 x i8> @fptou_v8f32_v8i8(<8 x float> %a) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fcvtzs v1.4s, v1.4s ; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-SD-NEXT: xtn v1.4h, v1.4s -; CHECK-SD-NEXT: xtn v0.4h, v0.4s -; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: xtn v0.8b, v0.8h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptou_v8f32_v8i8: @@ -3072,12 +2996,8 @@ define <16 x i8> @fptos_v16f32_v16i8(<16 x float> %a) { ; CHECK-SD-NEXT: fcvtzs v2.4s, v2.4s ; CHECK-SD-NEXT: fcvtzs v1.4s, v1.4s ; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-SD-NEXT: xtn v3.4h, v3.4s -; CHECK-SD-NEXT: xtn v2.4h, v2.4s -; CHECK-SD-NEXT: xtn v1.4h, v1.4s -; CHECK-SD-NEXT: xtn v0.4h, v0.4s -; CHECK-SD-NEXT: mov v2.d[1], v3.d[0] -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v2.16b ; CHECK-SD-NEXT: ret ; @@ -3134,20 +3054,12 @@ define <32 x i8> @fptos_v32f32_v32i8(<32 x float> %a) { ; CHECK-SD-NEXT: fcvtzs v6.4s, v6.4s ; CHECK-SD-NEXT: fcvtzs v5.4s, v5.4s ; CHECK-SD-NEXT: fcvtzs v4.4s, v4.4s -; CHECK-SD-NEXT: xtn v3.4h, v3.4s -; CHECK-SD-NEXT: xtn v2.4h, v2.4s -; CHECK-SD-NEXT: xtn v1.4h, v1.4s -; CHECK-SD-NEXT: xtn v0.4h, v0.4s -; CHECK-SD-NEXT: xtn v7.4h, v7.4s -; CHECK-SD-NEXT: xtn v6.4h, v6.4s -; CHECK-SD-NEXT: xtn v5.4h, v5.4s -; CHECK-SD-NEXT: xtn v4.4h, v4.4s -; CHECK-SD-NEXT: mov v2.d[1], v3.d[0] -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: mov v6.d[1], v7.d[0] -; CHECK-SD-NEXT: mov v4.d[1], v5.d[0] +; CHECK-SD-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; CHECK-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: uzp1 v1.8h, v6.8h, v7.8h +; CHECK-SD-NEXT: uzp1 v3.8h, v4.8h, v5.8h ; CHECK-SD-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: uzp1 v1.16b, v4.16b, v6.16b +; CHECK-SD-NEXT: uzp1 v1.16b, v3.16b, v1.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: fptos_v32f32_v32i8: diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll index b677d077..5d78ad2 100644 --- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll +++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll @@ -104,7 +104,7 @@ define void @v4i32_v4i8(<4 x i32> %a, ptr %result) { ; CHECK-LABEL: v4i32_v4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret %b = trunc <4 x i32> %a to <4 x i8> @@ -170,8 +170,7 @@ define void @v2i16_v2i8(<2 x i16> %a, ptr %result) { define void @v4i16_v4i8(<4 x i16> %a, ptr %result) { ; CHECK-LABEL: v4i16_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret %b = trunc <4 x i16> %a to <4 x i8> diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 5f905d9..6f1ae02 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -145,7 +145,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret %x = load <4 x i8>, ptr %px diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll index 0ef6478..fb571ef 100644 --- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll +++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll @@ -353,13 +353,17 @@ define <8 x i8> @shuffle4_v8i8_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { ; CHECK-LABEL: shuffle4_v4i8_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b -; CHECK-NEXT: uzp1 v1.8b, v2.8b, v3.8b +; CHECK-NEXT: fmov d5, d2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 ; CHECK-NEXT: adrp x8, .LCPI8_0 -; CHECK-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-NEXT: fmov d4, d0 ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b +; CHECK-NEXT: mov v4.d[1], v1.d[0] +; CHECK-NEXT: mov v5.d[1], v3.d[0] +; CHECK-NEXT: bic v4.8h, #255, lsl #8 +; CHECK-NEXT: bic v5.8h, #255, lsl #8 +; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b }, v0.16b ; CHECK-NEXT: ret %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index d79f3ae..b1131f2 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -202,7 +202,7 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){ ; CHECK-SD-NEXT: ext v0.8b, v1.8b, v0.8b, #6 ; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4 -; CHECK-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret @@ -390,7 +390,7 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){ ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: dup v0.4h, v0.h[0] -; CHECK-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme-write-vg.ll b/llvm/test/CodeGen/AArch64/sme-write-vg.ll new file mode 100644 index 0000000..577606d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-write-vg.ll @@ -0,0 +1,24 @@ +; RUN: llc -mattr=+sme -stop-after=finalize-isel < %s | FileCheck %s + +target triple = "aarch64" + +; Check that we don't define VG for 'smstart za' and 'smstop za' +define void @smstart_za() "aarch64_new_za" nounwind { + ; CHECK-LABEL: name: smstart_za + ; CHECK-NOT: implicit-def {{[^,]*}}$vg + ret void +} + +; Check that we do define VG for 'smstart sm' and 'smstop sm' +define void @smstart_sm() nounwind { + ; CHECK-LABEL: name: smstart_sm + ; CHECK: MSRpstatesvcrImm1 1, 1, + ; CHECK-SAME: implicit-def {{[^,]*}}$vg + ; CHECK: MSRpstatesvcrImm1 1, 0, + ; CHECK-SAME: implicit-def {{[^,]*}}$vg + call void @require_sm() + ret void +} + +declare void @require_sm() "aarch64_pstate_sm_enabled" +declare void @require_za() "aarch64_inout_za" diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index acec3e7..d1f843a 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -146,7 +146,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret %x = load <4 x i8>, ptr %px diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll index 4f8a4f7..0ad9900 100644 --- a/llvm/test/CodeGen/AArch64/tbl-loops.ll +++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll @@ -41,8 +41,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: fcvtzs v2.4s, v2.4s ; CHECK-NEXT: xtn v1.4h, v1.4s ; CHECK-NEXT: xtn v2.4h, v2.4s -; CHECK-NEXT: xtn v1.8b, v1.8h -; CHECK-NEXT: xtn v2.8b, v2.8h +; CHECK-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-NEXT: uzp1 v2.8b, v2.8b, v0.8b ; CHECK-NEXT: mov v1.s[1], v2.s[0] ; CHECK-NEXT: stur d1, [x12, #-4] ; CHECK-NEXT: add x12, x12, #8 diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll index ba367b0..18cd4cc 100644 --- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll @@ -710,23 +710,23 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-NEXT: LBB6_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp q4, q1, [x0, #48] -; CHECK-NEXT: add x9, x1, #8 -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: subs x8, x8, #1 +; CHECK-NEXT: add x9, x1, #10 ; CHECK-NEXT: ldr d0, [x0, #80] +; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: ldr q5, [x0, #32] +; CHECK-NEXT: subs x8, x8, #1 ; CHECK-NEXT: add x0, x0, #128 -; CHECK-NEXT: uzp1.4s v4, v5, v4 -; CHECK-NEXT: uzp1.4s v2, v3, v2 ; CHECK-NEXT: uzp1.4s v0, v1, v0 -; CHECK-NEXT: uzp1.8h v1, v2, v4 +; CHECK-NEXT: uzp1.4s v1, v5, v4 +; CHECK-NEXT: uzp1.4s v2, v3, v2 ; CHECK-NEXT: xtn.4h v0, v0 -; CHECK-NEXT: uzp1.16b v1, v1, v0 -; CHECK-NEXT: xtn.8b v0, v0 -; CHECK-NEXT: st1.h { v1 }[4], [x9] -; CHECK-NEXT: add x9, x1, #10 -; CHECK-NEXT: st1.b { v0 }[2], [x9] -; CHECK-NEXT: str d1, [x1], #16 +; CHECK-NEXT: uzp1.8h v1, v2, v1 +; CHECK-NEXT: uzp1.8b v2, v0, v0 +; CHECK-NEXT: uzp1.16b v0, v1, v0 +; CHECK-NEXT: st1.b { v2 }[2], [x9] +; CHECK-NEXT: add x9, x1, #8 +; CHECK-NEXT: st1.h { v0 }[4], [x9] +; CHECK-NEXT: str d0, [x1], #16 ; CHECK-NEXT: b.eq LBB6_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret @@ -755,7 +755,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-BE-NEXT: xtn v0.4h, v0.4s ; CHECK-BE-NEXT: uzp1 v1.8h, v1.8h, v2.8h ; CHECK-BE-NEXT: uzp1 v1.16b, v1.16b, v0.16b -; CHECK-BE-NEXT: xtn v0.8b, v0.8h +; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-BE-NEXT: rev16 v2.16b, v1.16b ; CHECK-BE-NEXT: rev64 v1.16b, v1.16b ; CHECK-BE-NEXT: st1 { v0.b }[2], [x9] @@ -790,7 +790,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { ; CHECK-DISABLE-NEXT: xtn v0.4h, v0.4s ; CHECK-DISABLE-NEXT: uzp1 v1.8h, v1.8h, v2.8h ; CHECK-DISABLE-NEXT: uzp1 v1.16b, v1.16b, v0.16b -; CHECK-DISABLE-NEXT: xtn v0.8b, v0.8h +; CHECK-DISABLE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-DISABLE-NEXT: rev16 v2.16b, v1.16b ; CHECK-DISABLE-NEXT: rev64 v1.16b, v1.16b ; CHECK-DISABLE-NEXT: st1 { v0.b }[2], [x9] diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index e05c65d..f0bbed5 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -142,7 +142,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-NEXT: movi d0, #0xff00ff00ff00ff ; CHECK-NEXT: uaddl v1.8h, v1.8b, v2.8b ; CHECK-NEXT: umin v0.4h, v1.4h, v0.4h -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret %x = load <4 x i8>, ptr %px diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index 05f43e7..82c0327 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -143,7 +143,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret %x = load <4 x i8>, ptr %px diff --git a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll index 380bdbc..6119405 100644 --- a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll +++ b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll @@ -9,9 +9,8 @@ define <8 x i8> @float_to_i8(ptr %in) { ; CHECK-NEXT: fadd v0.4s, v0.4s, v0.4s ; CHECK-NEXT: fcvtzs v0.4s, v0.4s ; CHECK-NEXT: fcvtzs v1.4s, v1.4s -; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: xtn v1.4h, v1.4s -; CHECK-NEXT: uzp1 v0.8b, v1.8b, v0.8b +; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret %l = load <8 x float>, ptr %in %scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0> diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll index 9c6ab8d..dd7a9c6 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll @@ -210,7 +210,7 @@ define void @no_combine_for_non_bool_truncate(<4 x i32> %vec, ptr %out) { ; CHECK-LABEL: no_combine_for_non_bool_truncate: ; CHECK: ; %bb.0: ; CHECK-NEXT: xtn.4h v0, v0 -; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: uzp1.8b v0, v0, v0 ; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 90328f7..71d55df 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -410,7 +410,7 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: ldrh w8, [x0, #4] ; BE-NEXT: rev32 v0.4h, v0.4h ; BE-NEXT: mov v0.h[2], w8 -; BE-NEXT: xtn v0.8b, v0.8h +; BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; BE-NEXT: rev32 v0.16b, v0.16b ; BE-NEXT: str s0, [sp, #12] ; BE-NEXT: ldrh w9, [sp, #12] @@ -456,7 +456,7 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: add x8, x8, :lo12:.LCPI11_0 ; BE-NEXT: ld1 { v1.4h }, [x8] ; BE-NEXT: add v0.4h, v0.4h, v1.4h -; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: umov w8, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] @@ -638,7 +638,7 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; BE-NEXT: .cfi_def_cfa_offset 16 ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 -; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: umov w8, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] @@ -672,7 +672,7 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { ; BE-NEXT: .cfi_def_cfa_offset 16 ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 -; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: umov w8, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] @@ -706,7 +706,7 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; BE-NEXT: .cfi_def_cfa_offset 16 ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 -; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: umov w8, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] @@ -741,7 +741,7 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; BE-NEXT: .cfi_def_cfa_offset 16 ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 -; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: umov w8, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] @@ -777,7 +777,7 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; BE-NEXT: .cfi_def_cfa_offset 16 ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 -; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: umov w8, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] @@ -801,7 +801,7 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: shrn.4h v0, v0, #16 -; CHECK-NEXT: xtn.8b v1, v0 +; CHECK-NEXT: uzp1.8b v1, v0, v0 ; CHECK-NEXT: umov.h w8, v0[2] ; CHECK-NEXT: str s1, [sp, #12] ; CHECK-NEXT: ldrh w9, [sp, #12] @@ -816,7 +816,7 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; BE-NEXT: .cfi_def_cfa_offset 16 ; BE-NEXT: ld1 { v0.4s }, [x0] ; BE-NEXT: shrn v0.4h, v0.4s, #16 -; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: umov w8, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #12] @@ -868,7 +868,7 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: ushll v0.8h, v0.8b, #0 ; BE-NEXT: ld1 { v0.b }[4], [x9] ; BE-NEXT: add v0.4h, v0.4h, v1.4h -; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: umov w8, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #8] @@ -921,7 +921,7 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: ushll v0.8h, v0.8b, #0 ; BE-NEXT: ld1 { v0.b }[4], [x9] ; BE-NEXT: add v0.4h, v0.4h, v1.4h -; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: umov w8, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b ; BE-NEXT: str s1, [sp, #8] diff --git a/llvm/test/CodeGen/AArch64/xor.ll b/llvm/test/CodeGen/AArch64/xor.ll index d92402c..7d7f7bf 100644 --- a/llvm/test/CodeGen/AArch64/xor.ll +++ b/llvm/test/CodeGen/AArch64/xor.ll @@ -51,7 +51,7 @@ define <4 x i32> @vec_add_of_not_decrement(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: vec_add_of_not_decrement: ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v1.16b, v1.16b -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %x, %y %r = sub <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1> diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index ebb77c1..9865883 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -16968,7 +16968,7 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -16977,7 +16977,7 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -17163,9 +17163,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -17174,9 +17174,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -17280,8 +17280,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -17293,8 +17291,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -17375,10 +17371,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v1, v1, v3 ; GCN-NEXT: v_min_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -17396,10 +17388,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -17522,12 +17510,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v2, v2, v5 ; GCN-NEXT: v_min_f32_e32 v1, v1, v4 ; GCN-NEXT: v_min_f32_e32 v0, v0, v3 @@ -17551,12 +17533,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 @@ -17688,14 +17664,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v3, v3, v7 ; GCN-NEXT: v_min_f32_e32 v2, v2, v6 ; GCN-NEXT: v_min_f32_e32 v1, v1, v5 @@ -17725,14 +17693,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 @@ -17951,22 +17911,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v7, v7, v15 ; GCN-NEXT: v_min_f32_e32 v6, v6, v14 ; GCN-NEXT: v_min_f32_e32 v5, v5, v13 @@ -18020,22 +17964,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v15 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v14 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v13 @@ -18382,71 +18310,51 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_min_f32_e32 v14, v14, v30 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_min_f32_e32 v13, v13, v29 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_min_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_min_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_min_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_min_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_min_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_min_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_min_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_min_f32_e32 v5, v5, v21 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 @@ -18461,8 +18369,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_min_f32_e32 v4, v4, v20 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -18474,21 +18380,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v3, v3, v19 ; GCN-NEXT: v_min_f32_e32 v2, v2, v18 ; GCN-NEXT: v_min_f32_e32 v1, v1, v17 ; GCN-NEXT: v_min_f32_e32 v0, v0, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -18503,8 +18398,9 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_min_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -18513,14 +18409,12 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 @@ -18531,13 +18425,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -18560,13 +18454,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 @@ -18579,48 +18473,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_min_f32_e32 v15, v15, v25 +; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 @@ -18634,6 +18494,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -19267,287 +19131,223 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_min_f32_e32 v31, v31, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_min_f32_e32 v30, v30, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_min_f32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_min_f32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_min_f32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_min_f32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_min_f32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_min_f32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_min_f32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_min_f32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_min_f32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_min_f32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_min_f32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_min_f32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_min_f32_e32 v17, v17, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_min_f32_e32 v16, v16, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_min_f32_e32 v15, v15, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_min_f32_e32 v14, v14, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_min_f32_e32 v13, v13, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_min_f32_e32 v12, v12, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_min_f32_e32 v11, v11, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_min_f32_e32 v10, v10, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_min_f32_e32 v9, v9, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_min_f32_e32 v8, v8, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_min_f32_e32 v7, v7, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_min_f32_e32 v6, v6, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_min_f32_e32 v5, v5, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_min_f32_e32 v4, v4, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_min_f32_e32 v3, v3, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_min_f32_e32 v2, v2, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_min_f32_e32 v1, v1, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -19590,322 +19390,258 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_min_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_min_f32_e32 v31, v31, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_min_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v25, v25, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v24, v24, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v32 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21097,8 +20833,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -21110,8 +20844,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21192,10 +20924,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v1, v1, v3 ; GCN-NEXT: v_max_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -21213,10 +20941,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -21339,12 +21063,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v2, v2, v5 ; GCN-NEXT: v_max_f32_e32 v1, v1, v4 ; GCN-NEXT: v_max_f32_e32 v0, v0, v3 @@ -21368,12 +21086,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 @@ -21505,14 +21217,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v3, v3, v7 ; GCN-NEXT: v_max_f32_e32 v2, v2, v6 ; GCN-NEXT: v_max_f32_e32 v1, v1, v5 @@ -21542,14 +21246,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 @@ -21768,22 +21464,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v7, v7, v15 ; GCN-NEXT: v_max_f32_e32 v6, v6, v14 ; GCN-NEXT: v_max_f32_e32 v5, v5, v13 @@ -21837,22 +21517,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v14 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v13 @@ -22199,71 +21863,51 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_max_f32_e32 v14, v14, v30 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_max_f32_e32 v13, v13, v29 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_max_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_max_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_max_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_max_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_max_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_max_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_max_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_max_f32_e32 v5, v5, v21 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 @@ -22278,8 +21922,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_max_f32_e32 v4, v4, v20 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -22291,21 +21933,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v3, v3, v19 ; GCN-NEXT: v_max_f32_e32 v2, v2, v18 ; GCN-NEXT: v_max_f32_e32 v1, v1, v17 ; GCN-NEXT: v_max_f32_e32 v0, v0, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -22320,8 +21951,9 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_max_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -22330,14 +21962,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 @@ -22348,13 +21978,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -22377,13 +22007,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 @@ -22396,48 +22026,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_max_f32_e32 v15, v15, v25 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 @@ -22451,6 +22047,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -23084,287 +22684,223 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_max_f32_e32 v31, v31, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_max_f32_e32 v30, v30, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_max_f32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_max_f32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_max_f32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_max_f32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_max_f32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_max_f32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_max_f32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_max_f32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_max_f32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_max_f32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_max_f32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_max_f32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_max_f32_e32 v17, v17, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_max_f32_e32 v16, v16, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_max_f32_e32 v15, v15, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_max_f32_e32 v14, v14, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_max_f32_e32 v13, v13, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_max_f32_e32 v12, v12, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_max_f32_e32 v11, v11, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_max_f32_e32 v10, v10, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_max_f32_e32 v9, v9, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_max_f32_e32 v8, v8, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_max_f32_e32 v7, v7, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_max_f32_e32 v6, v6, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_max_f32_e32 v5, v5, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_max_f32_e32 v4, v4, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_max_f32_e32 v3, v3, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_max_f32_e32 v2, v2, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_max_f32_e32 v1, v1, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -23407,322 +22943,258 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_max_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_max_f32_e32 v31, v31, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_max_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v25, v25, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v24, v24, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v32 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -25176,7 +24648,6 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 ; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -26818,11 +26289,17 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GCN-LABEL: v_canonicalize_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_canonicalize_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_canonicalize_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index dfadd8d..9472845 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -2996,18 +2996,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 +; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -3095,16 +3093,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 2.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; GFX6-NEXT: v_max_f32_e32 v2, 2.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -3198,9 +3195,8 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -3760,19 +3756,17 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000 ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_med3_f32 v3, v3, s2, 1.0 +; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -3863,18 +3857,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 +; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index 4ed1b8a..e198197 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -471,25 +471,15 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee ret void } -; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode: -; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GCN-DENORM-NOT: v_max -; GCN-DENORM-NOT: v_mul - -; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; GCN-DENORM-NOT: v_max -; GCN-DENORM-NOT: v_mul - -; GFX9: {{flat|global}}_store_dword -define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 { - %id = tail call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id - %load = load float, ptr addrspace(1) %gep, align 4 - %v = tail call float @llvm.minnum.f32(float %load, float 0.0) - %canonicalized = tail call float @llvm.canonicalize.f32(float %v) - store float %canonicalized, ptr addrspace(1) %gep, align 4 - ret void -} +; define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 { +; %id = tail call i32 @llvm.amdgcn.workitem.id.x() +; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id +; %load = load float, ptr addrspace(1) %gep, align 4 +; %v = tail call float @llvm.minnum.f32(float %load, float 0.0) +; %canonicalized = tail call float @llvm.canonicalize.f32(float %v) +; store float %canonicalized, ptr addrspace(1) %gep, align 4 +; ret void +; } ; GCN-LABEL: test_fold_canonicalize_minnum_value_f32: ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} @@ -523,32 +513,15 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1 ret void } -; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] - -; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] -; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]] - -; GFX9-FLUSH: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] -; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]] - -; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]] -; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]] - -; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]] - -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]] -define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) { - %id = tail call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id - %load = load float, ptr addrspace(1) %gep, align 4 - %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float)) - %canonicalized = tail call float @llvm.canonicalize.f32(float %v) - store float %canonicalized, ptr addrspace(1) %gep, align 4 - ret void -} +; define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) { +; %id = tail call i32 @llvm.amdgcn.workitem.id.x() +; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id +; %load = load float, ptr addrspace(1) %gep, align 4 +; %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float)) +; %canonicalized = tail call float @llvm.canonicalize.f32(float %v) +; store float %canonicalized, ptr addrspace(1) %gep, align 4 +; ret void +; } ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode: ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] @@ -674,10 +647,9 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrsp } ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16 -; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]], -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_short v{{.+}}, [[V]] +; GCN: {{flat|global}}_load_ushort [[V1:v[0-9]+]], +; GCN: v_max_f16_e32 [[V2:v[0-9]+]], [[V1]], [[V1]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V2]] define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id @@ -807,18 +779,13 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) { ret half %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16: -; GFX9: v_mul_f16_e32 -; GFX9: v_pk_mul_f16 -; GFX9-NOT: v_max -; GFX9-NOT: v_pk_max -define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) { - %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0> - %ins.op = fmul half %val, 8.0 - %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx - %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) - ret <2 x half> %canonicalized -} +; define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) { +; %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0> +; %ins.op = fmul half %val, 8.0 +; %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx +; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) +; ret <2 x half> %canonicalized +; } ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16: ; GFX9: v_mul_f16 @@ -842,15 +809,11 @@ define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x ret <2 x half> %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz: -; GCN: s_waitcnt -; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 -; GCN-NEXT: s_setpc_b64 -define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) { - %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) - %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt) - ret <2 x half> %canonicalized -} +; define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) { +; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) +; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt) +; ret <2 x half> %canonicalized +; } ; GCN-LABEL: {{^}}v_test_canonicalize_cubeid: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 27462130..581b7b4 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -94,7 +94,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -147,7 +146,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -170,6 +168,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ret void } +define half @s_test_canonicalize_arg(half %x) #1 { +; VI-LABEL: s_test_canonicalize_arg: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_test_canonicalize_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: s_test_canonicalize_arg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_test_canonicalize_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call half @llvm.canonicalize.f16(half %x) + ret half %canonicalized +} + define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 { ; VI-LABEL: v_test_canonicalize_build_vector_v2f16: ; VI: ; %bb.0: @@ -242,7 +269,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -299,7 +325,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -357,7 +382,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -414,7 +438,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -471,7 +494,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -1246,9 +1268,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1323,9 +1343,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1404,9 +1422,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1485,9 +1501,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1551,9 +1565,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, ; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -2424,7 +2436,6 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 { ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16: @@ -2456,8 +2467,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2738,7 +2748,6 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 ; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: @@ -2782,8 +2791,6 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal ; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: @@ -2826,13 +2833,10 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2878,18 +2882,18 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v6f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v6f16: @@ -2933,22 +2937,22 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v8f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v8f16: @@ -3001,30 +3005,30 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v12f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v12f16: @@ -3087,38 +3091,38 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v16f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v16f16: @@ -3216,68 +3220,68 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; CI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; CI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; CI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 @@ -3456,228 +3460,354 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v10 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v3, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v26 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v4, v5, v4 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v12 ; CI-NEXT: v_or_b32_e32 v5, v7, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v22 ; CI-NEXT: v_or_b32_e32 v6, v7, v6 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v19 ; CI-NEXT: v_or_b32_e32 v7, v9, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 -; CI-NEXT: v_or_b32_e32 v8, v9, v8 +; CI-NEXT: v_or_b32_e32 v8, v10, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v20 ; CI-NEXT: v_or_b32_e32 v9, v11, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; CI-NEXT: v_or_b32_e32 v10, v11, v10 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v24 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 +; CI-NEXT: v_or_b32_e32 v10, v12, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v30 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_or_b32_e32 v11, v13, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v30 -; CI-NEXT: v_or_b32_e32 v12, v13, v12 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; CI-NEXT: v_or_b32_e32 v13, v15, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; CI-NEXT: v_or_b32_e32 v12, v15, v12 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v31 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v21, v33 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_or_b32_e32 v13, v16, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12 ; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v24 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 ; CI-NEXT: v_or_b32_e32 v15, v25, v15 -; CI-NEXT: s_waitcnt vmcnt(11) -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: s_waitcnt vmcnt(10) -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v21 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 +; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v16 +; CI-NEXT: v_or_b32_e32 v16, v24, v25 +; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; CI-NEXT: v_or_b32_e32 v25, v28, v24 ; CI-NEXT: s_waitcnt vmcnt(9) ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_or_b32_e32 v16, v17, v16 -; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; CI-NEXT: v_or_b32_e32 v17, v19, v17 ; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; CI-NEXT: v_or_b32_e32 v20, v19, v20 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v34 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; CI-NEXT: v_or_b32_e32 v17, v17, v26 +; CI-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0 +; CI-NEXT: v_or_b32_e32 v18, v27, v18 +; CI-NEXT: buffer_store_dword v17, v26, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 +; CI-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x74, v0 +; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x70, v0 +; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_or_b32_e32 v18, v19, v18 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; CI-NEXT: v_or_b32_e32 v19, v21, v19 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v27 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v29 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: s_waitcnt vmcnt(12) +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; CI-NEXT: v_or_b32_e32 v20, v21, v20 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; CI-NEXT: v_or_b32_e32 v21, v27, v21 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_add_i32_e32 v21, vcc, 0x6c, v0 +; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: s_waitcnt vmcnt(13) +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: s_waitcnt vmcnt(12) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v24 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; CI-NEXT: v_or_b32_e32 v20, v23, v20 +; CI-NEXT: s_waitcnt vmcnt(9) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v28 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: s_waitcnt vmcnt(4) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; CI-NEXT: v_or_b32_e32 v24, v25, v24 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x7c, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_or_b32_e32 v22, v22, v23 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 -; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_or_b32_e32 v23, v27, v23 +; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 +; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_or_b32_e32 v17, v17, v18 +; CI-NEXT: v_add_i32_e32 v18, vcc, 0x64, v0 +; CI-NEXT: v_or_b32_e32 v25, v25, v26 +; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0 +; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x5c, v0 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_or_b32_e32 v19, v24, v19 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_or_b32_e32 v21, v22, v21 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: s_waitcnt vmcnt(4) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v22 +; CI-NEXT: v_or_b32_e32 v22, v23, v27 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 +; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; CI-NEXT: v_or_b32_e32 v23, v28, v23 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 +; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x74, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x70, v0 -; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; CI-NEXT: v_or_b32_e32 v23, v23, v27 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_or_b32_e32 v24, v24, v27 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x6c, v0 -; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 -; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; CI-NEXT: v_or_b32_e32 v27, v28, v27 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; CI-NEXT: v_or_b32_e32 v23, v26, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x64, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v23, vcc, 0x5c, v0 -; CI-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v23, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v22, vcc, 0x54, v0 -; CI-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0 -; CI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v21, vcc, 0x4c, v0 -; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 -; CI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 -; CI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 -; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; CI-NEXT: v_or_b32_e32 v28, v29, v28 +; CI-NEXT: buffer_store_dword v28, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v27, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0 +; CI-NEXT: buffer_store_dword v24, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; CI-NEXT: buffer_store_dword v23, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v22, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 +; CI-NEXT: buffer_store_dword v21, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x44, v0 +; CI-NEXT: buffer_store_dword v19, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0 ; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index c1093a1..d53c041 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -2389,7 +2389,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2471,15 +2470,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX6-NEXT: flat_load_dword v0, v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX6-NEXT: flat_store_dword v[0:1], v4 @@ -2724,7 +2721,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2807,15 +2803,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX6-NEXT: flat_load_dword v0, v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX6-NEXT: flat_store_dword v[0:1], v4 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index 78fb89c..b32630a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -951,8 +951,6 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1056,7 +1054,6 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1110,7 +1107,6 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1193,7 +1189,6 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,7 +1217,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1253,7 +1247,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1311,7 +1304,6 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1346,7 +1338,6 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1413,8 +1404,6 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1494,8 +1483,6 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1599,7 +1586,6 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1653,7 +1639,6 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1736,7 +1721,6 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1792,7 +1776,6 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1859,8 +1842,6 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3980,7 +3961,8 @@ define half @v_fneg_canonicalize_f16(half %a) #0 { ; SI-LABEL: v_fneg_canonicalize_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_canonicalize_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 17f6761..b5440b9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -1021,7 +1021,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1043,7 +1042,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index ab7ab4d..d056a97 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -32,8 +32,6 @@ define amdgpu_kernel void @maxnum_f16( ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -170,7 +168,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -279,7 +276,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -384,21 +380,17 @@ define amdgpu_kernel void @maxnum_v2f16( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_max_f32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v1, v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -497,20 +489,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, 4.0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -589,20 +579,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -688,27 +676,21 @@ define amdgpu_kernel void @maxnum_v3f16( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: s_lshr_b32 s3, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: s_lshr_b32 s8, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_max_f32_e32 v2, v3, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, v1, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_max_f32_e32 v0, v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: v_max_f32_e32 v1, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v2, v3, v4 +; SI-NEXT: v_max_f32_e32 v0, v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -837,25 +819,17 @@ define amdgpu_kernel void @maxnum_v4f16( ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: s_lshr_b32 s6, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_max_f32_e32 v3, v3, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, v1, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, v2, v5 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_max_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v1, v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -986,20 +960,16 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index b7370ce..f934a2d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -32,8 +32,6 @@ define amdgpu_kernel void @minnum_f16_ieee( ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -197,7 +195,6 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -305,7 +302,6 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -409,21 +405,17 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_min_f32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -556,20 +548,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, 4.0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -647,20 +637,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -745,27 +733,21 @@ define amdgpu_kernel void @minnum_v3f16( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: s_lshr_b32 s3, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: s_lshr_b32 s8, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_min_f32_e32 v2, v3, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, v1, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_min_f32_e32 v0, v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: v_min_f32_e32 v1, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v2, v3, v4 +; SI-NEXT: v_min_f32_e32 v0, v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -893,25 +875,17 @@ define amdgpu_kernel void @minnum_v4f16( ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: s_lshr_b32 s6, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_min_f32_e32 v3, v3, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, v1, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_min_f32_e32 v2, v2, v5 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_min_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v1, v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -1041,20 +1015,16 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index fb3e79b..5b7f0e7 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -951,56 +951,70 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX1100: ; %bb.0: ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v1, 0 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v6, 0 +; SDAG-GFX1100-NEXT: v_pk_max_f16 v2, v0, 0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v2, 1.0 op_sel_hi:[1,0] ; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, 0 +; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX906: ; %bb.0: ; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, 0 +; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7 ; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp ; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v1 +; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v2 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v3 +; SDAG-VI-NEXT: v_min_f16_e32 v1, 1.0, v1 ; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1139,63 +1153,80 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s } define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v7, 0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v7, 0 +; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_mov_b32_e32 v0, v6 -; GFX906-NEXT: v_mov_b32_e32 v1, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v7, 0 +; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-VI-NEXT: v_mac_f32_e32 v10, v7, v9 ; SDAG-VI-NEXT: v_mac_f32_e32 v11, v6, v8 -; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 ; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v1, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_max_f16_e32 v2, 0, v2 +; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v3 +; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0x3c00 +; SDAG-VI-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_min_f16_e32 v3, 1.0, v3 +; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v2 ; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-VI-NEXT: v_or_b32_e32 v1, v3, v1 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] @@ -1241,6 +1272,40 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, v2 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/Hexagon/addrmode-immop.mir b/llvm/test/CodeGen/Hexagon/addrmode-immop.mir index 3069cbe..1412d31 100644 --- a/llvm/test/CodeGen/Hexagon/addrmode-immop.mir +++ b/llvm/test/CodeGen/Hexagon/addrmode-immop.mir @@ -15,7 +15,7 @@ ; Function Attrs: norecurse define void @f0() #0 { b0: - %v0 = load ptr, ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, inrange i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1)), align 4 + %v0 = load ptr, ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1)), align 4 %v1 = call i32 %v0(ptr nonnull undef) unreachable } @@ -33,7 +33,7 @@ tracksRegLiveness: true body: | bb.0.b0: $r2 = A2_tfrsi @g0 + 12 - $r2 = L2_loadri_io killed $r2, @f1 - 1 :: (load (s32) from `ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, inrange i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1))`) + $r2 = L2_loadri_io killed $r2, @f1 - 1 :: (load (s32) from `ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1))`) ADJCALLSTACKDOWN 0, 0, implicit-def $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29 PS_callr_nr killed $r2, hexagoncsr, implicit undef $r0, implicit-def $r29, implicit-def dead $r0 ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29 diff --git a/llvm/test/CodeGen/NVPTX/b52037.ll b/llvm/test/CodeGen/NVPTX/b52037.ll index d9322da..5d1c390 100644 --- a/llvm/test/CodeGen/NVPTX/b52037.ll +++ b/llvm/test/CodeGen/NVPTX/b52037.ll @@ -47,7 +47,7 @@ bb: %tmp5 = load ptr, ptr %tmp4, align 8 %tmp9 = getelementptr inbounds %struct.zot, ptr %tmp, i64 0, i32 2, i32 1 store ptr %tmp5, ptr %tmp9, align 8 - store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @global_1, i64 0, inrange i32 0, i64 3), ptr %tmp, align 16 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @global_1, i64 0, i32 0, i64 3), ptr %tmp, align 16 %tmp.i1 = tail call i64 @foo() %tmp44.i16 = getelementptr inbounds i16, ptr %tmp5, i64 undef %tmp45.i17 = load i16, ptr %tmp44.i16, align 2 diff --git a/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir b/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir index 3a312d2..f3ef95b 100644 --- a/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir +++ b/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir @@ -130,7 +130,7 @@ body: | %22:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @c %10:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @e %13:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a - %14:g8rc_and_g8rc_nox0 = ADDItocL killed %13, @a, implicit $x2 + %14:g8rc_and_g8rc_nox0 = ADDItocL8 killed %13, @a, implicit $x2 bb.2.while.body: successors: %bb.4(0x30000000), %bb.3(0x50000000) diff --git a/llvm/test/CodeGen/PowerPC/toc-data-large-array.ll b/llvm/test/CodeGen/PowerPC/toc-data-large-array.ll new file mode 100644 index 0000000..90f40d9 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/toc-data-large-array.ll @@ -0,0 +1,16 @@ +; RUN: not --crash llc -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR +; RUN: not --crash llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR + +@a = global [5 x i16] zeroinitializer, align 2 #0 + +; Function Attrs: noinline +define i16 @foo() #1 { +entry: + %0 = load i16, ptr @a, align 2 + ret i16 %0 +} + +attributes #0 = { "toc-data" } +attributes #1 = { noinline } + +; CHECK-ERROR: LLVM ERROR: A GlobalVariable with size larger than a TOC entry is not currently supported by the toc data transformation. diff --git a/llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll b/llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll new file mode 100644 index 0000000..f870e99 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll @@ -0,0 +1,8 @@ +; RUN: not --crash llc -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR +; RUN: not --crash llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR + +@a = global [5 x i16] zeroinitializer, align 2 #0 + +attributes #0 = { "toc-data" } + +; CHECK-ERROR: LLVM ERROR: A GlobalVariable with size larger than a TOC entry is not currently supported by the toc data transformation. diff --git a/llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll b/llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll new file mode 100644 index 0000000..a5c9a8b --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll @@ -0,0 +1,110 @@ +; RUN: llc -mtriple powerpc-ibm-aix-xcoff < %s | FileCheck %s --check-prefix CHECK +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff < %s | FileCheck %s --check-prefix CHECK + +; RUN: llc -filetype=obj -mtriple powerpc-ibm-aix-xcoff < %s -o %t32.o +; RUN: llvm-readobj %t32.o --syms | FileCheck %s --check-prefix=OBJ32 +; RUN: llc -filetype=obj -mtriple powerpc64-ibm-aix-xcoff < %s -o %t64.o +; RUN: llvm-readobj %t64.o --syms | FileCheck %s --check-prefix=OBJ64 + +%struct.small_struct = type { i16 } + +@a = global %struct.small_struct zeroinitializer, align 2 #0 +@b = global [2 x i16] zeroinitializer, align 2 #0 + +; Function Attrs: noinline +define i16 @foo() #1 { +entry: + %0 = load i16, ptr @a, align 2 + %1 = load i16, ptr @b, align 2 + %add = add nsw i16 %0, %1 + ret i16 %add +} + +attributes #0 = { "toc-data" } +attributes #1 = { noinline } + +; CHECK: .toc +; CHECK-NEXT: .csect a[TD],2 +; CHECK-NEXT: .globl a[TD] # @a +; CHECK-NEXT: .align 1 +; CHECK-NEXT: .space 2 +; CHECK-NEXT: .csect b[TD],2 +; CHECK-NEXT: .globl b[TD] # @b +; CHECK-NEXT: .align 1 +; CHECK-NEXT: .space 4 + +; OBJ32: Symbol { +; OBJ32: Name: a +; OBJ32-NEXT: Value (RelocatableAddress): 0x3C +; OBJ32-NEXT: Section: .data +; OBJ32-NEXT: Type: 0x0 +; OBJ32-NEXT: StorageClass: C_EXT (0x2) +; OBJ32-NEXT: NumberOfAuxEntries: 1 +; OBJ32-NEXT: CSECT Auxiliary Entry { +; OBJ32-NEXT: Index: {{[0-9]+}} +; OBJ32-NEXT: SectionLen: 2 +; OBJ32-NEXT: ParameterHashIndex: 0x0 +; OBJ32-NEXT: TypeChkSectNum: 0x0 +; OBJ32-NEXT: SymbolAlignmentLog2: 2 +; OBJ32-NEXT: SymbolType: XTY_SD (0x1) +; OBJ32-NEXT: StorageMappingClass: XMC_TD (0x10) +; OBJ32-NEXT: StabInfoIndex: 0x0 +; OBJ32-NEXT: StabSectNum: 0x0 +; OBJ32-NEXT: } +; OBJ32-NEXT: } +; OBJ32-NEXT: Symbol { +; OBJ32: Name: b +; OBJ32-NEXT: Value (RelocatableAddress): 0x40 +; OBJ32-NEXT: Section: .data +; OBJ32-NEXT: Type: 0x0 +; OBJ32-NEXT: StorageClass: C_EXT (0x2) +; OBJ32-NEXT: NumberOfAuxEntries: 1 +; OBJ32-NEXT: CSECT Auxiliary Entry { +; OBJ32-NEXT: Index: {{[0-9]+}} +; OBJ32-NEXT: SectionLen: 4 +; OBJ32-NEXT: ParameterHashIndex: 0x0 +; OBJ32-NEXT: TypeChkSectNum: 0x0 +; OBJ32-NEXT: SymbolAlignmentLog2: 2 +; OBJ32-NEXT: SymbolType: XTY_SD (0x1) +; OBJ32-NEXT: StorageMappingClass: XMC_TD (0x10) +; OBJ32-NEXT: StabInfoIndex: 0x0 +; OBJ32-NEXT: StabSectNum: 0x0 +; OBJ32-NEXT: } +; OBJ32-NEXT: } + +; OBJ64: Symbol { +; OBJ64: Name: a +; OBJ64-NEXT: Value (RelocatableAddress): 0x48 +; OBJ64-NEXT: Section: .data +; OBJ64-NEXT: Type: 0x0 +; OBJ64-NEXT: StorageClass: C_EXT (0x2) +; OBJ64-NEXT: NumberOfAuxEntries: 1 +; OBJ64-NEXT: CSECT Auxiliary Entry { +; OBJ64-NEXT: Index: {{[0-9]+}} +; OBJ64-NEXT: SectionLen: 2 +; OBJ64-NEXT: ParameterHashIndex: 0x0 +; OBJ64-NEXT: TypeChkSectNum: 0x0 +; OBJ64-NEXT: SymbolAlignmentLog2: 2 +; OBJ64-NEXT: SymbolType: XTY_SD (0x1) +; OBJ64-NEXT: StorageMappingClass: XMC_TD (0x10) +; OBJ64-NEXT: Auxiliary Type: AUX_CSECT (0xFB) +; OBJ64-NEXT: } +; OBJ64-NEXT: } +; OBJ64-NEXT: Symbol { +; OBJ64: Name: b +; OBJ64-NEXT: Value (RelocatableAddress): 0x4C +; OBJ64-NEXT: Section: .data +; OBJ64-NEXT: Type: 0x0 +; OBJ64-NEXT: StorageClass: C_EXT (0x2) +; OBJ64-NEXT: NumberOfAuxEntries: 1 +; OBJ64-NEXT: CSECT Auxiliary Entry { +; OBJ64-NEXT: Index: {{[0-9]+}} +; OBJ64-NEXT: SectionLen: 4 +; OBJ64-NEXT: ParameterHashIndex: 0x0 +; OBJ64-NEXT: TypeChkSectNum: 0x0 +; OBJ64-NEXT: SymbolAlignmentLog2: 2 +; OBJ64-NEXT: SymbolType: XTY_SD (0x1) +; OBJ64-NEXT: StorageMappingClass: XMC_TD (0x10) +; OBJ64-NEXT: Auxiliary Type: AUX_CSECT (0xFB) +; OBJ64-NEXT: } +; OBJ64-NEXT: } diff --git a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll new file mode 100644 index 0000000..673008d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll @@ -0,0 +1,871 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -verify-machineinstrs -mtriple=riscv64 -mattr=+v,+d,+m,+zbb %s -o - | FileCheck %s --check-prefix=RV64 +; RUN: llc -verify-machineinstrs -mtriple=riscv32 -mattr=+v,+d,+m,+zbb %s -o - | FileCheck %s --check-prefix=RV32 + +; Compress + store for i8 type + +define void @test_compresstore_v1i8(ptr %p, <1 x i1> %mask, <1 x i8> %data) { +; RV64-LABEL: test_compresstore_v1i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; RV64-NEXT: vse8.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v1i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; RV32-NEXT: vse8.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v1i8(<1 x i8> %data, ptr align 1 %p, <1 x i1> %mask) + ret void +} + +define void @test_compresstore_v2i8(ptr %p, <2 x i1> %mask, <2 x i8> %data) { +; RV64-LABEL: test_compresstore_v2i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; RV64-NEXT: vse8.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v2i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; RV32-NEXT: vse8.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v2i8(<2 x i8> %data, ptr align 1 %p, <2 x i1> %mask) + ret void +} + +define void @test_compresstore_v4i8(ptr %p, <4 x i1> %mask, <4 x i8> %data) { +; RV64-LABEL: test_compresstore_v4i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vse8.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v4i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vse8.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v4i8(<4 x i8> %data, ptr align 1 %p, <4 x i1> %mask) + ret void +} + +define void @test_compresstore_v8i8(ptr %p, <8 x i1> %mask, <8 x i8> %data) { +; RV64-LABEL: test_compresstore_v8i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vse8.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v8i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vse8.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v8i8(<8 x i8> %data, ptr align 1 %p, <8 x i1> %mask) + ret void +} + +define void @test_compresstore_v16i8(ptr %p, <16 x i1> %mask, <16 x i8> %data) { +; RV64-LABEL: test_compresstore_v16i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; RV64-NEXT: vse8.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v16i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; RV32-NEXT: vse8.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v16i8(<16 x i8> %data, ptr align 1 %p, <16 x i1> %mask) + ret void +} + +define void @test_compresstore_v32i8(ptr %p, <32 x i1> %mask, <32 x i8> %data) { +; RV64-LABEL: test_compresstore_v32i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; RV64-NEXT: vcompress.vm v10, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; RV64-NEXT: vse8.v v10, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v32i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; RV32-NEXT: vcompress.vm v10, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; RV32-NEXT: vse8.v v10, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v32i8(<32 x i8> %data, ptr align 1 %p, <32 x i1> %mask) + ret void +} + +define void @test_compresstore_v64i8(ptr %p, <64 x i1> %mask, <64 x i8> %data) { +; RV64-LABEL: test_compresstore_v64i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a1, 64 +; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV64-NEXT: vcompress.vm v12, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV64-NEXT: vse8.v v12, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v64i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a1, 64 +; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV32-NEXT: vcompress.vm v12, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RV32-NEXT: vse8.v v12, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v64i8(<64 x i8> %data, ptr align 1 %p, <64 x i1> %mask) + ret void +} + +define void @test_compresstore_v128i8(ptr %p, <128 x i1> %mask, <128 x i8> %data) { +; RV64-LABEL: test_compresstore_v128i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a1, 128 +; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; RV64-NEXT: vcompress.vm v16, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; RV64-NEXT: vse8.v v16, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v128i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a1, 128 +; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; RV32-NEXT: vcompress.vm v16, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; RV32-NEXT: vse8.v v16, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v128i8(<128 x i8> %data, ptr align 1 %p, <128 x i1> %mask) + ret void +} + +define void @test_compresstore_v256i8(ptr %p, <256 x i1> %mask, <256 x i8> %data) { +; RV64-LABEL: test_compresstore_v256i8: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vmv1r.v v7, v8 +; RV64-NEXT: li a2, 128 +; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV64-NEXT: vle8.v v24, (a1) +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vslidedown.vi v9, v0, 1 +; RV64-NEXT: vmv.x.s a1, v9 +; RV64-NEXT: vmv.x.s a3, v0 +; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV64-NEXT: vcompress.vm v8, v16, v0 +; RV64-NEXT: vcpop.m a4, v0 +; RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV64-NEXT: vcompress.vm v8, v24, v7 +; RV64-NEXT: vcpop.m a2, v7 +; RV64-NEXT: cpop a3, a3 +; RV64-NEXT: cpop a1, a1 +; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v256i8: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vmv1r.v v7, v8 +; RV32-NEXT: li a2, 128 +; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV32-NEXT: vle8.v v24, (a1) +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vslidedown.vi v9, v0, 1 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsrl.vx v10, v9, a1 +; RV32-NEXT: vmv.x.s a3, v10 +; RV32-NEXT: vsrl.vx v10, v0, a1 +; RV32-NEXT: vmv.x.s a1, v10 +; RV32-NEXT: vmv.x.s a4, v9 +; RV32-NEXT: vmv.x.s a5, v0 +; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV32-NEXT: vcompress.vm v8, v16, v0 +; RV32-NEXT: vcpop.m a6, v0 +; RV32-NEXT: vsetvli zero, a6, e8, m8, ta, ma +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: cpop a1, a1 +; RV32-NEXT: cpop a5, a5 +; RV32-NEXT: add a1, a5, a1 +; RV32-NEXT: cpop a3, a3 +; RV32-NEXT: cpop a4, a4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; RV32-NEXT: vcompress.vm v8, v24, v7 +; RV32-NEXT: vcpop.m a1, v7 +; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v256i8(<256 x i8> %data, ptr align 1 %p, <256 x i1> %mask) + ret void +} + +; Compress + store for i16 type + +define void @test_compresstore_v1i16(ptr %p, <1 x i1> %mask, <1 x i16> %data) { +; RV64-LABEL: test_compresstore_v1i16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vse16.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v1i16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vse16.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v1i16(<1 x i16> %data, ptr align 2 %p, <1 x i1> %mask) + ret void +} + +define void @test_compresstore_v2i16(ptr %p, <2 x i1> %mask, <2 x i16> %data) { +; RV64-LABEL: test_compresstore_v2i16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vse16.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v2i16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vse16.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v2i16(<2 x i16> %data, ptr align 2 %p, <2 x i1> %mask) + ret void +} + +define void @test_compresstore_v4i16(ptr %p, <4 x i1> %mask, <4 x i16> %data) { +; RV64-LABEL: test_compresstore_v4i16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vse16.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v4i16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vse16.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v4i16(<4 x i16> %data, ptr align 2 %p, <4 x i1> %mask) + ret void +} + +define void @test_compresstore_v8i16(ptr %p, <8 x i1> %mask, <8 x i16> %data) { +; RV64-LABEL: test_compresstore_v8i16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; RV64-NEXT: vse16.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v8i16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; RV32-NEXT: vse16.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v8i16(<8 x i16> %data, ptr align 2 %p, <8 x i1> %mask) + ret void +} + +define void @test_compresstore_v16i16(ptr %p, <16 x i1> %mask, <16 x i16> %data) { +; RV64-LABEL: test_compresstore_v16i16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: vcompress.vm v10, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; RV64-NEXT: vse16.v v10, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v16i16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-NEXT: vcompress.vm v10, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; RV32-NEXT: vse16.v v10, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v16i16(<16 x i16> %data, ptr align 2 %p, <16 x i1> %mask) + ret void +} + +define void @test_compresstore_v32i16(ptr %p, <32 x i1> %mask, <32 x i16> %data) { +; RV64-LABEL: test_compresstore_v32i16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; RV64-NEXT: vcompress.vm v12, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; RV64-NEXT: vse16.v v12, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v32i16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; RV32-NEXT: vcompress.vm v12, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; RV32-NEXT: vse16.v v12, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v32i16(<32 x i16> %data, ptr align 2 %p, <32 x i1> %mask) + ret void +} + +define void @test_compresstore_v64i16(ptr %p, <64 x i1> %mask, <64 x i16> %data) { +; RV64-LABEL: test_compresstore_v64i16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a1, 64 +; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV64-NEXT: vcompress.vm v16, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV64-NEXT: vse16.v v16, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v64i16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a1, 64 +; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV32-NEXT: vcompress.vm v16, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV32-NEXT: vse16.v v16, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v64i16(<64 x i16> %data, ptr align 2 %p, <64 x i1> %mask) + ret void +} + +define void @test_compresstore_v128i16(ptr %p, <128 x i1> %mask, <128 x i16> %data) { +; RV64-LABEL: test_compresstore_v128i16: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a1, 64 +; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV64-NEXT: vcompress.vm v24, v8, v0 +; RV64-NEXT: vcpop.m a2, v0 +; RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64-NEXT: vse16.v v24, (a0) +; RV64-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; RV64-NEXT: vslidedown.vi v8, v0, 8 +; RV64-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV64-NEXT: vcompress.vm v24, v16, v8 +; RV64-NEXT: vcpop.m a2, v8 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmv.x.s a1, v0 +; RV64-NEXT: cpop a1, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64-NEXT: vse16.v v24, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v128i16: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a1, 64 +; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV32-NEXT: vcompress.vm v24, v8, v0 +; RV32-NEXT: vcpop.m a2, v0 +; RV32-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV32-NEXT: vse16.v v24, (a0) +; RV32-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; RV32-NEXT: vslidedown.vi v24, v0, 8 +; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV32-NEXT: vcompress.vm v8, v16, v24 +; RV32-NEXT: vcpop.m a1, v24 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vsrl.vx v16, v0, a2 +; RV32-NEXT: vmv.x.s a2, v16 +; RV32-NEXT: cpop a2, a2 +; RV32-NEXT: vmv.x.s a3, v0 +; RV32-NEXT: cpop a3, a3 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV32-NEXT: vse16.v v8, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v128i16(<128 x i16> %data, ptr align 2 %p, <128 x i1> %mask) + ret void +} + +; Compress + store for i32 type + +define void @test_compresstore_v1i32(ptr %p, <1 x i1> %mask, <1 x i32> %data) { +; RV64-LABEL: test_compresstore_v1i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vse32.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v1i32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vse32.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v1i32(<1 x i32> %data, ptr align 4 %p, <1 x i1> %mask) + ret void +} + +define void @test_compresstore_v2i32(ptr %p, <2 x i1> %mask, <2 x i32> %data) { +; RV64-LABEL: test_compresstore_v2i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vse32.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v2i32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vse32.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v2i32(<2 x i32> %data, ptr align 4 %p, <2 x i1> %mask) + ret void +} + +define void @test_compresstore_v4i32(ptr %p, <4 x i1> %mask, <4 x i32> %data) { +; RV64-LABEL: test_compresstore_v4i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vse32.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v4i32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vse32.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %data, ptr align 4 %p, <4 x i1> %mask) + ret void +} + +define void @test_compresstore_v8i32(ptr %p, <8 x i1> %mask, <8 x i32> %data) { +; RV64-LABEL: test_compresstore_v8i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v10, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV64-NEXT: vse32.v v10, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v8i32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vcompress.vm v10, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: vse32.v v10, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %data, ptr align 4 %p, <8 x i1> %mask) + ret void +} + +define void @test_compresstore_v16i32(ptr %p, <16 x i1> %mask, <16 x i32> %data) { +; RV64-LABEL: test_compresstore_v16i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV64-NEXT: vcompress.vm v12, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vse32.v v12, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v16i32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vcompress.vm v12, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vse32.v v12, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %data, ptr align 4 %p, <16 x i1> %mask) + ret void +} + +define void @test_compresstore_v32i32(ptr %p, <32 x i1> %mask, <32 x i32> %data) { +; RV64-LABEL: test_compresstore_v32i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vcompress.vm v16, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vse32.v v16, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v32i32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vcompress.vm v16, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vse32.v v16, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v32i32(<32 x i32> %data, ptr align 4 %p, <32 x i1> %mask) + ret void +} + +define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data) { +; RV64-LABEL: test_compresstore_v64i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vcompress.vm v24, v8, v0 +; RV64-NEXT: vcpop.m a2, v0 +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV64-NEXT: vse32.v v24, (a0) +; RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64-NEXT: vslidedown.vi v8, v0, 4 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vcompress.vm v24, v16, v8 +; RV64-NEXT: vcpop.m a1, v8 +; RV64-NEXT: vmv.x.s a2, v0 +; RV64-NEXT: cpopw a2, a2 +; RV64-NEXT: slli a2, a2, 2 +; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vse32.v v24, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v64i32: +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vcompress.vm v24, v8, v0 +; RV32-NEXT: vcpop.m a2, v0 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vse32.v v24, (a0) +; RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV32-NEXT: vslidedown.vi v8, v0, 4 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vcompress.vm v24, v16, v8 +; RV32-NEXT: vcpop.m a1, v8 +; RV32-NEXT: vmv.x.s a2, v0 +; RV32-NEXT: cpop a2, a2 +; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vse32.v v24, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v64i32(<64 x i32> %data, ptr align 4 %p, <64 x i1> %mask) + ret void +} + +; Compress + store for i64 type + +define void @test_compresstore_v1i64(ptr %p, <1 x i1> %mask, <1 x i64> %data) { +; RV64-LABEL: test_compresstore_v1i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vse64.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v1i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV32-NEXT: vse64.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v1i64(<1 x i64> %data, ptr align 8 %p, <1 x i1> %mask) + ret void +} + +define void @test_compresstore_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %data) { +; RV64-LABEL: test_compresstore_v2i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vse64.v v9, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v2i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV32-NEXT: vse64.v v9, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %data, ptr align 8 %p, <2 x i1> %mask) + ret void +} + +define void @test_compresstore_v4i64(ptr %p, <4 x i1> %mask, <4 x i64> %data) { +; RV64-LABEL: test_compresstore_v4i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vcompress.vm v10, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vse64.v v10, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v4i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vcompress.vm v10, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV32-NEXT: vse64.v v10, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %data, ptr align 8 %p, <4 x i1> %mask) + ret void +} + +define void @test_compresstore_v8i64(ptr %p, <8 x i1> %mask, <8 x i64> %data) { +; RV64-LABEL: test_compresstore_v8i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vcompress.vm v12, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vse64.v v12, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v8i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vcompress.vm v12, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV32-NEXT: vse64.v v12, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %data, ptr align 8 %p, <8 x i1> %mask) + ret void +} + +define void @test_compresstore_v16i64(ptr %p, <16 x i1> %mask, <16 x i64> %data) { +; RV64-LABEL: test_compresstore_v16i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vcompress.vm v16, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vse64.v v16, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v16i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vcompress.vm v16, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vse64.v v16, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v16i64(<16 x i64> %data, ptr align 8 %p, <16 x i1> %mask) + ret void +} + +define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data) { +; RV64-LABEL: test_compresstore_v32i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vcompress.vm v24, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vse64.v v24, (a0) +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vcompress.vm v8, v16, v24 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.x.s a1, v0 +; RV64-NEXT: zext.h a1, a1 +; RV64-NEXT: cpopw a1, a1 +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vcpop.m a1, v24 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: ret +; +; RV32-LABEL: test_compresstore_v32i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vcompress.vm v24, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vse64.v v24, (a0) +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v24, v0, 2 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vcompress.vm v8, v16, v24 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.x.s a1, v0 +; RV32-NEXT: zext.h a1, a1 +; RV32-NEXT: cpop a1, a1 +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: vcpop.m a1, v24 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: ret +entry: + tail call void @llvm.masked.compressstore.v32i64(<32 x i64> %data, ptr align 8 %p, <32 x i1> %mask) + ret void +} + +declare void @llvm.masked.compressstore.v1i8(<1 x i8>, ptr, <1 x i1>) +declare void @llvm.masked.compressstore.v2i8(<2 x i8>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v4i8(<4 x i8>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>) +declare void @llvm.masked.compressstore.v128i8(<128 x i8>, ptr, <128 x i1>) +declare void @llvm.masked.compressstore.v256i8(<256 x i8>, ptr, <256 x i1>) + +declare void @llvm.masked.compressstore.v1i16(<1 x i16>, ptr, <1 x i1>) +declare void @llvm.masked.compressstore.v2i16(<2 x i16>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v64i16(<64 x i16>, ptr, <64 x i1>) +declare void @llvm.masked.compressstore.v128i16(<128 x i16>, ptr, <128 x i1>) + +declare void @llvm.masked.compressstore.v1i32(<1 x i32>, ptr, <1 x i1>) +declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v32i32(<32 x i32>, ptr, <32 x i1>) +declare void @llvm.masked.compressstore.v64i32(<64 x i32>, ptr, <64 x i1>) + +declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>) +declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>) +declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>) +declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>) +declare void @llvm.masked.compressstore.v16i64(<16 x i64>, ptr, <16 x i1>) +declare void @llvm.masked.compressstore.v32i64(<32 x i64>, ptr, <32 x i1>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll index 52c5292..36fbdd8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll @@ -6,24 +6,20 @@ declare void @llvm.masked.compressstore.v1f16(<1 x half>, ptr, <1 x i1>) define void @compressstore_v1f16(ptr %base, <1 x half> %v, <1 x i1> %mask) { ; RV32-LABEL: compressstore_v1f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vfirst.m a1, v0 -; RV32-NEXT: bnez a1, .LBB0_2 -; RV32-NEXT: # %bb.1: # %cond.store ; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vse16.v v8, (a0) -; RV32-NEXT: .LBB0_2: # %else +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vse16.v v9, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v1f16: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vfirst.m a1, v0 -; RV64-NEXT: bnez a1, .LBB0_2 -; RV64-NEXT: # %bb.1: # %cond.store ; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV64-NEXT: vse16.v v8, (a0) -; RV64-NEXT: .LBB0_2: # %else +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vse16.v v9, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v1f16(<1 x half> %v, ptr align 2 %base, <1 x i1> %mask) ret void @@ -33,48 +29,20 @@ declare void @llvm.masked.compressstore.v2f16(<2 x half>, ptr, <2 x i1>) define void @compressstore_v2f16(ptr %base, <2 x half> %v, <2 x i1> %mask) { ; RV32-LABEL: compressstore_v2f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB1_3 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: bnez a1, .LBB1_4 -; RV32-NEXT: .LBB1_2: # %else2 -; RV32-NEXT: ret -; RV32-NEXT: .LBB1_3: # %cond.store -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vse16.v v8, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: beqz a1, .LBB1_2 -; RV32-NEXT: .LBB1_4: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 1 -; RV32-NEXT: vse16.v v8, (a0) +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vse16.v v9, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v2f16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB1_3 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: bnez a1, .LBB1_4 -; RV64-NEXT: .LBB1_2: # %else2 -; RV64-NEXT: ret -; RV64-NEXT: .LBB1_3: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV64-NEXT: vse16.v v8, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: beqz a1, .LBB1_2 -; RV64-NEXT: .LBB1_4: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 1 -; RV64-NEXT: vse16.v v8, (a0) +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vse16.v v9, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v2f16(<2 x half> %v, ptr align 2 %base, <2 x i1> %mask) ret void @@ -84,88 +52,20 @@ declare void @llvm.masked.compressstore.v4f16(<4 x half>, ptr, <4 x i1>) define void @compressstore_v4f16(ptr %base, <4 x half> %v, <4 x i1> %mask) { ; RV32-LABEL: compressstore_v4f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB2_5 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB2_6 -; RV32-NEXT: .LBB2_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB2_7 -; RV32-NEXT: .LBB2_3: # %else5 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: bnez a1, .LBB2_8 -; RV32-NEXT: .LBB2_4: # %else8 -; RV32-NEXT: ret -; RV32-NEXT: .LBB2_5: # %cond.store -; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV32-NEXT: vse16.v v8, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB2_2 -; RV32-NEXT: .LBB2_6: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 1 -; RV32-NEXT: vse16.v v9, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB2_3 -; RV32-NEXT: .LBB2_7: # %cond.store4 -; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 2 +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma ; RV32-NEXT: vse16.v v9, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: beqz a1, .LBB2_4 -; RV32-NEXT: .LBB2_8: # %cond.store7 -; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 3 -; RV32-NEXT: vse16.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v4f16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB2_5 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB2_6 -; RV64-NEXT: .LBB2_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB2_7 -; RV64-NEXT: .LBB2_3: # %else5 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: bnez a1, .LBB2_8 -; RV64-NEXT: .LBB2_4: # %else8 -; RV64-NEXT: ret -; RV64-NEXT: .LBB2_5: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64-NEXT: vse16.v v8, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB2_2 -; RV64-NEXT: .LBB2_6: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-NEXT: vse16.v v9, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB2_3 -; RV64-NEXT: .LBB2_7: # %cond.store4 -; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 2 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma ; RV64-NEXT: vse16.v v9, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: beqz a1, .LBB2_4 -; RV64-NEXT: .LBB2_8: # %cond.store7 -; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 3 -; RV64-NEXT: vse16.v v8, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v4f16(<4 x half> %v, ptr align 2 %base, <4 x i1> %mask) ret void @@ -175,168 +75,20 @@ declare void @llvm.masked.compressstore.v8f16(<8 x half>, ptr, <8 x i1>) define void @compressstore_v8f16(ptr %base, <8 x half> %v, <8 x i1> %mask) { ; RV32-LABEL: compressstore_v8f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB3_9 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB3_10 -; RV32-NEXT: .LBB3_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB3_11 -; RV32-NEXT: .LBB3_3: # %else5 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: bnez a2, .LBB3_12 -; RV32-NEXT: .LBB3_4: # %else8 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: bnez a2, .LBB3_13 -; RV32-NEXT: .LBB3_5: # %else11 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: bnez a2, .LBB3_14 -; RV32-NEXT: .LBB3_6: # %else14 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: bnez a2, .LBB3_15 -; RV32-NEXT: .LBB3_7: # %else17 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: bnez a1, .LBB3_16 -; RV32-NEXT: .LBB3_8: # %else20 -; RV32-NEXT: ret -; RV32-NEXT: .LBB3_9: # %cond.store -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vse16.v v8, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB3_2 -; RV32-NEXT: .LBB3_10: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 1 -; RV32-NEXT: vse16.v v9, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB3_3 -; RV32-NEXT: .LBB3_11: # %cond.store4 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 2 -; RV32-NEXT: vse16.v v9, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: beqz a2, .LBB3_4 -; RV32-NEXT: .LBB3_12: # %cond.store7 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 3 -; RV32-NEXT: vse16.v v9, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: beqz a2, .LBB3_5 -; RV32-NEXT: .LBB3_13: # %cond.store10 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 4 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; RV32-NEXT: vse16.v v9, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: beqz a2, .LBB3_6 -; RV32-NEXT: .LBB3_14: # %cond.store13 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 5 -; RV32-NEXT: vse16.v v9, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: beqz a2, .LBB3_7 -; RV32-NEXT: .LBB3_15: # %cond.store16 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 6 -; RV32-NEXT: vse16.v v9, (a0) -; RV32-NEXT: addi a0, a0, 2 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: beqz a1, .LBB3_8 -; RV32-NEXT: .LBB3_16: # %cond.store19 -; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 7 -; RV32-NEXT: vse16.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v8f16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB3_9 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB3_10 -; RV64-NEXT: .LBB3_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB3_11 -; RV64-NEXT: .LBB3_3: # %else5 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: bnez a2, .LBB3_12 -; RV64-NEXT: .LBB3_4: # %else8 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: bnez a2, .LBB3_13 -; RV64-NEXT: .LBB3_5: # %else11 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: bnez a2, .LBB3_14 -; RV64-NEXT: .LBB3_6: # %else14 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: bnez a2, .LBB3_15 -; RV64-NEXT: .LBB3_7: # %else17 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: bnez a1, .LBB3_16 -; RV64-NEXT: .LBB3_8: # %else20 -; RV64-NEXT: ret -; RV64-NEXT: .LBB3_9: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vse16.v v8, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB3_2 -; RV64-NEXT: .LBB3_10: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-NEXT: vse16.v v9, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB3_3 -; RV64-NEXT: .LBB3_11: # %cond.store4 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 2 -; RV64-NEXT: vse16.v v9, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: beqz a2, .LBB3_4 -; RV64-NEXT: .LBB3_12: # %cond.store7 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 3 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; RV64-NEXT: vse16.v v9, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: beqz a2, .LBB3_5 -; RV64-NEXT: .LBB3_13: # %cond.store10 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 4 -; RV64-NEXT: vse16.v v9, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: beqz a2, .LBB3_6 -; RV64-NEXT: .LBB3_14: # %cond.store13 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 5 -; RV64-NEXT: vse16.v v9, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: beqz a2, .LBB3_7 -; RV64-NEXT: .LBB3_15: # %cond.store16 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 6 -; RV64-NEXT: vse16.v v9, (a0) -; RV64-NEXT: addi a0, a0, 2 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: beqz a1, .LBB3_8 -; RV64-NEXT: .LBB3_16: # %cond.store19 -; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 7 -; RV64-NEXT: vse16.v v8, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v8f16(<8 x half> %v, ptr align 2 %base, <8 x i1> %mask) ret void @@ -346,24 +98,20 @@ declare void @llvm.masked.compressstore.v1f32(<1 x float>, ptr, <1 x i1>) define void @compressstore_v1f32(ptr %base, <1 x float> %v, <1 x i1> %mask) { ; RV32-LABEL: compressstore_v1f32: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vfirst.m a1, v0 -; RV32-NEXT: bnez a1, .LBB4_2 -; RV32-NEXT: # %bb.1: # %cond.store ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: .LBB4_2: # %else +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vse32.v v9, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v1f32: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vfirst.m a1, v0 -; RV64-NEXT: bnez a1, .LBB4_2 -; RV64-NEXT: # %bb.1: # %cond.store ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: .LBB4_2: # %else +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vse32.v v9, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v1f32(<1 x float> %v, ptr align 4 %base, <1 x i1> %mask) ret void @@ -373,48 +121,20 @@ declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>) define void @compressstore_v2f32(ptr %base, <2 x float> %v, <2 x i1> %mask) { ; RV32-LABEL: compressstore_v2f32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB5_3 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: bnez a1, .LBB5_4 -; RV32-NEXT: .LBB5_2: # %else2 -; RV32-NEXT: ret -; RV32-NEXT: .LBB5_3: # %cond.store -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: beqz a1, .LBB5_2 -; RV32-NEXT: .LBB5_4: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 1 -; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vse32.v v9, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v2f32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB5_3 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: bnez a1, .LBB5_4 -; RV64-NEXT: .LBB5_2: # %else2 -; RV64-NEXT: ret -; RV64-NEXT: .LBB5_3: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: beqz a1, .LBB5_2 -; RV64-NEXT: .LBB5_4: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 1 -; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vse32.v v9, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v2f32(<2 x float> %v, ptr align 4 %base, <2 x i1> %mask) ret void @@ -424,88 +144,20 @@ declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>) define void @compressstore_v4f32(ptr %base, <4 x float> %v, <4 x i1> %mask) { ; RV32-LABEL: compressstore_v4f32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB6_5 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB6_6 -; RV32-NEXT: .LBB6_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB6_7 -; RV32-NEXT: .LBB6_3: # %else5 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: bnez a1, .LBB6_8 -; RV32-NEXT: .LBB6_4: # %else8 -; RV32-NEXT: ret -; RV32-NEXT: .LBB6_5: # %cond.store -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB6_2 -; RV32-NEXT: .LBB6_6: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 1 -; RV32-NEXT: vse32.v v9, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB6_3 -; RV32-NEXT: .LBB6_7: # %cond.store4 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 2 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vse32.v v9, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: beqz a1, .LBB6_4 -; RV32-NEXT: .LBB6_8: # %cond.store7 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 3 -; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v4f32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB6_5 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB6_6 -; RV64-NEXT: .LBB6_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB6_7 -; RV64-NEXT: .LBB6_3: # %else5 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: bnez a1, .LBB6_8 -; RV64-NEXT: .LBB6_4: # %else8 -; RV64-NEXT: ret -; RV64-NEXT: .LBB6_5: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB6_2 -; RV64-NEXT: .LBB6_6: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 1 -; RV64-NEXT: vse32.v v9, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB6_3 -; RV64-NEXT: .LBB6_7: # %cond.store4 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 2 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vse32.v v9, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: beqz a1, .LBB6_4 -; RV64-NEXT: .LBB6_8: # %cond.store7 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 3 -; RV64-NEXT: vse32.v v8, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v4f32(<4 x float> %v, ptr align 4 %base, <4 x i1> %mask) ret void @@ -515,176 +167,20 @@ declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>) define void @compressstore_v8f32(ptr %base, <8 x float> %v, <8 x i1> %mask) { ; RV32-LABEL: compressstore_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB7_9 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB7_10 -; RV32-NEXT: .LBB7_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB7_11 -; RV32-NEXT: .LBB7_3: # %else5 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: bnez a2, .LBB7_12 -; RV32-NEXT: .LBB7_4: # %else8 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: bnez a2, .LBB7_13 -; RV32-NEXT: .LBB7_5: # %else11 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: bnez a2, .LBB7_14 -; RV32-NEXT: .LBB7_6: # %else14 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: bnez a2, .LBB7_15 -; RV32-NEXT: .LBB7_7: # %else17 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: bnez a1, .LBB7_16 -; RV32-NEXT: .LBB7_8: # %else20 -; RV32-NEXT: ret -; RV32-NEXT: .LBB7_9: # %cond.store -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB7_2 -; RV32-NEXT: .LBB7_10: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 1 -; RV32-NEXT: vse32.v v10, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB7_3 -; RV32-NEXT: .LBB7_11: # %cond.store4 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 2 -; RV32-NEXT: vse32.v v10, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: beqz a2, .LBB7_4 -; RV32-NEXT: .LBB7_12: # %cond.store7 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 3 -; RV32-NEXT: vse32.v v10, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: beqz a2, .LBB7_5 -; RV32-NEXT: .LBB7_13: # %cond.store10 -; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 4 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vse32.v v10, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: beqz a2, .LBB7_6 -; RV32-NEXT: .LBB7_14: # %cond.store13 -; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 5 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vse32.v v10, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: beqz a2, .LBB7_7 -; RV32-NEXT: .LBB7_15: # %cond.store16 -; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 6 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vcompress.vm v10, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV32-NEXT: vse32.v v10, (a0) -; RV32-NEXT: addi a0, a0, 4 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: beqz a1, .LBB7_8 -; RV32-NEXT: .LBB7_16: # %cond.store19 -; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 7 -; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v8f32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB7_9 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB7_10 -; RV64-NEXT: .LBB7_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB7_11 -; RV64-NEXT: .LBB7_3: # %else5 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: bnez a2, .LBB7_12 -; RV64-NEXT: .LBB7_4: # %else8 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: bnez a2, .LBB7_13 -; RV64-NEXT: .LBB7_5: # %else11 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: bnez a2, .LBB7_14 -; RV64-NEXT: .LBB7_6: # %else14 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: bnez a2, .LBB7_15 -; RV64-NEXT: .LBB7_7: # %else17 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: bnez a1, .LBB7_16 -; RV64-NEXT: .LBB7_8: # %else20 -; RV64-NEXT: ret -; RV64-NEXT: .LBB7_9: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB7_2 -; RV64-NEXT: .LBB7_10: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-NEXT: vse32.v v10, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB7_3 -; RV64-NEXT: .LBB7_11: # %cond.store4 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 2 -; RV64-NEXT: vse32.v v10, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: beqz a2, .LBB7_4 -; RV64-NEXT: .LBB7_12: # %cond.store7 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-NEXT: vse32.v v10, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: beqz a2, .LBB7_5 -; RV64-NEXT: .LBB7_13: # %cond.store10 -; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 4 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vse32.v v10, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: beqz a2, .LBB7_6 -; RV64-NEXT: .LBB7_14: # %cond.store13 -; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 5 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vse32.v v10, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: beqz a2, .LBB7_7 -; RV64-NEXT: .LBB7_15: # %cond.store16 -; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 6 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v10, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vse32.v v10, (a0) -; RV64-NEXT: addi a0, a0, 4 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: beqz a1, .LBB7_8 -; RV64-NEXT: .LBB7_16: # %cond.store19 -; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 7 -; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vse32.v v8, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v8f32(<8 x float> %v, ptr align 4 %base, <8 x i1> %mask) ret void @@ -694,24 +190,20 @@ declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>) define void @compressstore_v1f64(ptr %base, <1 x double> %v, <1 x i1> %mask) { ; RV32-LABEL: compressstore_v1f64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vfirst.m a1, v0 -; RV32-NEXT: bnez a1, .LBB8_2 -; RV32-NEXT: # %bb.1: # %cond.store ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vse64.v v8, (a0) -; RV32-NEXT: .LBB8_2: # %else +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV32-NEXT: vse64.v v9, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v1f64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vfirst.m a1, v0 -; RV64-NEXT: bnez a1, .LBB8_2 -; RV64-NEXT: # %bb.1: # %cond.store ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: .LBB8_2: # %else +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vse64.v v9, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v1f64(<1 x double> %v, ptr align 8 %base, <1 x i1> %mask) ret void @@ -721,48 +213,20 @@ declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>) define void @compressstore_v2f64(ptr %base, <2 x double> %v, <2 x i1> %mask) { ; RV32-LABEL: compressstore_v2f64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB9_3 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: bnez a1, .LBB9_4 -; RV32-NEXT: .LBB9_2: # %else2 -; RV32-NEXT: ret -; RV32-NEXT: .LBB9_3: # %cond.store -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vse64.v v8, (a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: beqz a1, .LBB9_2 -; RV32-NEXT: .LBB9_4: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 1 -; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vcompress.vm v9, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV32-NEXT: vse64.v v9, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v2f64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB9_3 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: bnez a1, .LBB9_4 -; RV64-NEXT: .LBB9_2: # %else2 -; RV64-NEXT: ret -; RV64-NEXT: .LBB9_3: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: beqz a1, .LBB9_2 -; RV64-NEXT: .LBB9_4: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 1 -; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vcompress.vm v9, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vse64.v v9, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v2f64(<2 x double> %v, ptr align 8 %base, <2 x i1> %mask) ret void @@ -772,92 +236,20 @@ declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>) define void @compressstore_v4f64(ptr %base, <4 x double> %v, <4 x i1> %mask) { ; RV32-LABEL: compressstore_v4f64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB10_5 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB10_6 -; RV32-NEXT: .LBB10_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB10_7 -; RV32-NEXT: .LBB10_3: # %else5 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: bnez a1, .LBB10_8 -; RV32-NEXT: .LBB10_4: # %else8 -; RV32-NEXT: ret -; RV32-NEXT: .LBB10_5: # %cond.store -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vse64.v v8, (a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB10_2 -; RV32-NEXT: .LBB10_6: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vcompress.vm v10, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m2, ta, ma ; RV32-NEXT: vse64.v v10, (a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB10_3 -; RV32-NEXT: .LBB10_7: # %cond.store4 -; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 2 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vse64.v v10, (a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: beqz a1, .LBB10_4 -; RV32-NEXT: .LBB10_8: # %cond.store7 -; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 3 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v4f64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB10_5 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB10_6 -; RV64-NEXT: .LBB10_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB10_7 -; RV64-NEXT: .LBB10_3: # %else5 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: bnez a1, .LBB10_8 -; RV64-NEXT: .LBB10_4: # %else8 -; RV64-NEXT: ret -; RV64-NEXT: .LBB10_5: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB10_2 -; RV64-NEXT: .LBB10_6: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-NEXT: vse64.v v10, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB10_3 -; RV64-NEXT: .LBB10_7: # %cond.store4 -; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 2 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vcompress.vm v10, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma ; RV64-NEXT: vse64.v v10, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: beqz a1, .LBB10_4 -; RV64-NEXT: .LBB10_8: # %cond.store7 -; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 3 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret call void @llvm.masked.compressstore.v4f64(<4 x double> %v, ptr align 8 %base, <4 x i1> %mask) ret void @@ -867,213 +259,21 @@ declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>) define void @compressstore_v8f64(ptr %base, <8 x double> %v, <8 x i1> %mask) { ; RV32-LABEL: compressstore_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB11_11 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB11_12 -; RV32-NEXT: .LBB11_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB11_13 -; RV32-NEXT: .LBB11_3: # %else5 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: beqz a2, .LBB11_5 -; RV32-NEXT: .LBB11_4: # %cond.store7 -; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 3 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vse64.v v12, (a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: .LBB11_5: # %else8 -; RV32-NEXT: addi sp, sp, -320 -; RV32-NEXT: .cfi_def_cfa_offset 320 -; RV32-NEXT: sw ra, 316(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 312(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 320 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: bnez a2, .LBB11_14 -; RV32-NEXT: # %bb.6: # %else11 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: bnez a2, .LBB11_15 -; RV32-NEXT: .LBB11_7: # %else14 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: bnez a2, .LBB11_16 -; RV32-NEXT: .LBB11_8: # %else17 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: beqz a1, .LBB11_10 -; RV32-NEXT: .LBB11_9: # %cond.store19 -; RV32-NEXT: mv a1, sp ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vse64.v v8, (a1) -; RV32-NEXT: fld fa5, 56(sp) -; RV32-NEXT: fsd fa5, 0(a0) -; RV32-NEXT: .LBB11_10: # %else20 -; RV32-NEXT: addi sp, s0, -320 -; RV32-NEXT: lw ra, 316(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 312(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 320 -; RV32-NEXT: ret -; RV32-NEXT: .LBB11_11: # %cond.store -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vse64.v v8, (a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB11_2 -; RV32-NEXT: .LBB11_12: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 1 +; RV32-NEXT: vcompress.vm v12, v8, v0 +; RV32-NEXT: vcpop.m a1, v0 +; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV32-NEXT: vse64.v v12, (a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB11_3 -; RV32-NEXT: .LBB11_13: # %cond.store4 -; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 2 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vse64.v v12, (a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: bnez a2, .LBB11_4 -; RV32-NEXT: j .LBB11_5 -; RV32-NEXT: .LBB11_14: # %cond.store10 -; RV32-NEXT: addi a2, sp, 192 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vse64.v v8, (a2) -; RV32-NEXT: fld fa5, 224(sp) -; RV32-NEXT: fsd fa5, 0(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: beqz a2, .LBB11_7 -; RV32-NEXT: .LBB11_15: # %cond.store13 -; RV32-NEXT: addi a2, sp, 128 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vse64.v v8, (a2) -; RV32-NEXT: fld fa5, 168(sp) -; RV32-NEXT: fsd fa5, 0(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: beqz a2, .LBB11_8 -; RV32-NEXT: .LBB11_16: # %cond.store16 -; RV32-NEXT: addi a2, sp, 64 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vse64.v v8, (a2) -; RV32-NEXT: fld fa5, 112(sp) -; RV32-NEXT: fsd fa5, 0(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: bnez a1, .LBB11_9 -; RV32-NEXT: j .LBB11_10 +; RV32-NEXT: ret ; ; RV64-LABEL: compressstore_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB11_11 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB11_12 -; RV64-NEXT: .LBB11_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB11_13 -; RV64-NEXT: .LBB11_3: # %else5 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: beqz a2, .LBB11_5 -; RV64-NEXT: .LBB11_4: # %cond.store7 -; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 3 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v12, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: .LBB11_5: # %else8 -; RV64-NEXT: addi sp, sp, -320 -; RV64-NEXT: .cfi_def_cfa_offset 320 -; RV64-NEXT: sd ra, 312(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 304(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 320 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: bnez a2, .LBB11_14 -; RV64-NEXT: # %bb.6: # %else11 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: bnez a2, .LBB11_15 -; RV64-NEXT: .LBB11_7: # %else14 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: bnez a2, .LBB11_16 -; RV64-NEXT: .LBB11_8: # %else17 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: beqz a1, .LBB11_10 -; RV64-NEXT: .LBB11_9: # %cond.store19 -; RV64-NEXT: mv a1, sp ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a1) -; RV64-NEXT: fld fa5, 56(sp) -; RV64-NEXT: fsd fa5, 0(a0) -; RV64-NEXT: .LBB11_10: # %else20 -; RV64-NEXT: addi sp, s0, -320 -; RV64-NEXT: ld ra, 312(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 304(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 320 -; RV64-NEXT: ret -; RV64-NEXT: .LBB11_11: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB11_2 -; RV64-NEXT: .LBB11_12: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 1 +; RV64-NEXT: vcompress.vm v12, v8, v0 +; RV64-NEXT: vcpop.m a1, v0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV64-NEXT: vse64.v v12, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB11_3 -; RV64-NEXT: .LBB11_13: # %cond.store4 -; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 2 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v12, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: bnez a2, .LBB11_4 -; RV64-NEXT: j .LBB11_5 -; RV64-NEXT: .LBB11_14: # %cond.store10 -; RV64-NEXT: addi a2, sp, 192 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: fld fa5, 224(sp) -; RV64-NEXT: fsd fa5, 0(a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: beqz a2, .LBB11_7 -; RV64-NEXT: .LBB11_15: # %cond.store13 -; RV64-NEXT: addi a2, sp, 128 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: fld fa5, 168(sp) -; RV64-NEXT: fsd fa5, 0(a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: beqz a2, .LBB11_8 -; RV64-NEXT: .LBB11_16: # %cond.store16 -; RV64-NEXT: addi a2, sp, 64 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: fld fa5, 112(sp) -; RV64-NEXT: fsd fa5, 0(a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: bnez a1, .LBB11_9 -; RV64-NEXT: j .LBB11_10 +; RV64-NEXT: ret call void @llvm.masked.compressstore.v8f64(<8 x double> %v, ptr align 8 %base, <8 x i1> %mask) ret void } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll index eb0096d..a388ba9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-int.ll @@ -6,13 +6,11 @@ declare void @llvm.masked.compressstore.v1i8(<1 x i8>, ptr, <1 x i1>) define void @compressstore_v1i8(ptr %base, <1 x i8> %v, <1 x i1> %mask) { ; CHECK-LABEL: compressstore_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vfirst.m a1, v0 -; CHECK-NEXT: bnez a1, .LBB0_2 -; CHECK-NEXT: # %bb.1: # %cond.store ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vse8.v v8, (a0) -; CHECK-NEXT: .LBB0_2: # %else +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vse8.v v9, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v1i8(<1 x i8> %v, ptr %base, <1 x i1> %mask) ret void @@ -22,25 +20,11 @@ declare void @llvm.masked.compressstore.v2i8(<2 x i8>, ptr, <2 x i1>) define void @compressstore_v2i8(ptr %base, <2 x i8> %v, <2 x i1> %mask) { ; CHECK-LABEL: compressstore_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB1_3 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: bnez a1, .LBB1_4 -; CHECK-NEXT: .LBB1_2: # %else2 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_3: # %cond.store -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vse8.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: beqz a1, .LBB1_2 -; CHECK-NEXT: .LBB1_4: # %cond.store1 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vse8.v v9, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v2i8(<2 x i8> %v, ptr %base, <2 x i1> %mask) ret void @@ -50,45 +34,11 @@ declare void @llvm.masked.compressstore.v4i8(<4 x i8>, ptr, <4 x i1>) define void @compressstore_v4i8(ptr %base, <4 x i8> %v, <4 x i1> %mask) { ; CHECK-LABEL: compressstore_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB2_5 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB2_6 -; CHECK-NEXT: .LBB2_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB2_7 -; CHECK-NEXT: .LBB2_3: # %else5 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: bnez a1, .LBB2_8 -; CHECK-NEXT: .LBB2_4: # %else8 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_5: # %cond.store -; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; CHECK-NEXT: vse8.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB2_2 -; CHECK-NEXT: .LBB2_6: # %cond.store1 -; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; CHECK-NEXT: vse8.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB2_3 -; CHECK-NEXT: .LBB2_7: # %cond.store4 -; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 2 -; CHECK-NEXT: vse8.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: beqz a1, .LBB2_4 -; CHECK-NEXT: .LBB2_8: # %cond.store7 -; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 3 -; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v4i8(<4 x i8> %v, ptr %base, <4 x i1> %mask) ret void @@ -98,85 +48,11 @@ declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>) define void @compressstore_v8i8(ptr %base, <8 x i8> %v, <8 x i1> %mask) { ; CHECK-LABEL: compressstore_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB3_9 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB3_10 -; CHECK-NEXT: .LBB3_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB3_11 -; CHECK-NEXT: .LBB3_3: # %else5 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: bnez a2, .LBB3_12 -; CHECK-NEXT: .LBB3_4: # %else8 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: bnez a2, .LBB3_13 -; CHECK-NEXT: .LBB3_5: # %else11 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: bnez a2, .LBB3_14 -; CHECK-NEXT: .LBB3_6: # %else14 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: bnez a2, .LBB3_15 -; CHECK-NEXT: .LBB3_7: # %else17 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: bnez a1, .LBB3_16 -; CHECK-NEXT: .LBB3_8: # %else20 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB3_9: # %cond.store -; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vse8.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB3_2 -; CHECK-NEXT: .LBB3_10: # %cond.store1 -; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 1 -; CHECK-NEXT: vse8.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB3_3 -; CHECK-NEXT: .LBB3_11: # %cond.store4 -; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 2 -; CHECK-NEXT: vse8.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: beqz a2, .LBB3_4 -; CHECK-NEXT: .LBB3_12: # %cond.store7 -; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 3 -; CHECK-NEXT: vse8.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: beqz a2, .LBB3_5 -; CHECK-NEXT: .LBB3_13: # %cond.store10 -; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 4 -; CHECK-NEXT: vse8.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: beqz a2, .LBB3_6 -; CHECK-NEXT: .LBB3_14: # %cond.store13 -; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 5 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; CHECK-NEXT: vse8.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: beqz a2, .LBB3_7 -; CHECK-NEXT: .LBB3_15: # %cond.store16 -; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 6 -; CHECK-NEXT: vse8.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: beqz a1, .LBB3_8 -; CHECK-NEXT: .LBB3_16: # %cond.store19 -; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v8i8(<8 x i8> %v, ptr %base, <8 x i1> %mask) ret void @@ -186,13 +62,11 @@ declare void @llvm.masked.compressstore.v1i16(<1 x i16>, ptr, <1 x i1>) define void @compressstore_v1i16(ptr %base, <1 x i16> %v, <1 x i1> %mask) { ; CHECK-LABEL: compressstore_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vfirst.m a1, v0 -; CHECK-NEXT: bnez a1, .LBB4_2 -; CHECK-NEXT: # %bb.1: # %cond.store ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: .LBB4_2: # %else +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v9, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v1i16(<1 x i16> %v, ptr align 2 %base, <1 x i1> %mask) ret void @@ -202,25 +76,11 @@ declare void @llvm.masked.compressstore.v2i16(<2 x i16>, ptr, <2 x i1>) define void @compressstore_v2i16(ptr %base, <2 x i16> %v, <2 x i1> %mask) { ; CHECK-LABEL: compressstore_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB5_3 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: bnez a1, .LBB5_4 -; CHECK-NEXT: .LBB5_2: # %else2 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB5_3: # %cond.store -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: beqz a1, .LBB5_2 -; CHECK-NEXT: .LBB5_4: # %cond.store1 -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v9, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v2i16(<2 x i16> %v, ptr align 2 %base, <2 x i1> %mask) ret void @@ -230,45 +90,11 @@ declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>) define void @compressstore_v4i16(ptr %base, <4 x i16> %v, <4 x i1> %mask) { ; CHECK-LABEL: compressstore_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB6_5 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB6_6 -; CHECK-NEXT: .LBB6_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB6_7 -; CHECK-NEXT: .LBB6_3: # %else5 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: bnez a1, .LBB6_8 -; CHECK-NEXT: .LBB6_4: # %else8 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB6_5: # %cond.store -; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB6_2 -; CHECK-NEXT: .LBB6_6: # %cond.store1 -; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 1 -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB6_3 -; CHECK-NEXT: .LBB6_7: # %cond.store4 -; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma ; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: beqz a1, .LBB6_4 -; CHECK-NEXT: .LBB6_8: # %cond.store7 -; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 3 -; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v4i16(<4 x i16> %v, ptr align 2 %base, <4 x i1> %mask) ret void @@ -278,85 +104,11 @@ declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>) define void @compressstore_v8i16(ptr %base, <8 x i16> %v, <8 x i1> %mask) { ; CHECK-LABEL: compressstore_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB7_9 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB7_10 -; CHECK-NEXT: .LBB7_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB7_11 -; CHECK-NEXT: .LBB7_3: # %else5 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: bnez a2, .LBB7_12 -; CHECK-NEXT: .LBB7_4: # %else8 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: bnez a2, .LBB7_13 -; CHECK-NEXT: .LBB7_5: # %else11 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: bnez a2, .LBB7_14 -; CHECK-NEXT: .LBB7_6: # %else14 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: bnez a2, .LBB7_15 -; CHECK-NEXT: .LBB7_7: # %else17 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: bnez a1, .LBB7_16 -; CHECK-NEXT: .LBB7_8: # %else20 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB7_9: # %cond.store -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB7_2 -; CHECK-NEXT: .LBB7_10: # %cond.store1 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 1 -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB7_3 -; CHECK-NEXT: .LBB7_11: # %cond.store4 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 2 -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: beqz a2, .LBB7_4 -; CHECK-NEXT: .LBB7_12: # %cond.store7 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 3 -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: beqz a2, .LBB7_5 -; CHECK-NEXT: .LBB7_13: # %cond.store10 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 4 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: beqz a2, .LBB7_6 -; CHECK-NEXT: .LBB7_14: # %cond.store13 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 5 -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: beqz a2, .LBB7_7 -; CHECK-NEXT: .LBB7_15: # %cond.store16 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 6 -; CHECK-NEXT: vse16.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 2 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: beqz a1, .LBB7_8 -; CHECK-NEXT: .LBB7_16: # %cond.store19 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v8i16(<8 x i16> %v, ptr align 2 %base, <8 x i1> %mask) ret void @@ -366,13 +118,11 @@ declare void @llvm.masked.compressstore.v1i32(<1 x i32>, ptr, <1 x i1>) define void @compressstore_v1i32(ptr %base, <1 x i32> %v, <1 x i1> %mask) { ; CHECK-LABEL: compressstore_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vfirst.m a1, v0 -; CHECK-NEXT: bnez a1, .LBB8_2 -; CHECK-NEXT: # %bb.1: # %cond.store ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: .LBB8_2: # %else +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v1i32(<1 x i32> %v, ptr align 4 %base, <1 x i1> %mask) ret void @@ -382,25 +132,11 @@ declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>) define void @compressstore_v2i32(ptr %base, <2 x i32> %v, <2 x i1> %mask) { ; CHECK-LABEL: compressstore_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB9_3 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: bnez a1, .LBB9_4 -; CHECK-NEXT: .LBB9_2: # %else2 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB9_3: # %cond.store -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a1, a1, 2 -; CHECK-NEXT: beqz a1, .LBB9_2 -; CHECK-NEXT: .LBB9_4: # %cond.store1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v2i32(<2 x i32> %v, ptr align 4 %base, <2 x i1> %mask) ret void @@ -410,45 +146,11 @@ declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>) define void @compressstore_v4i32(ptr %base, <4 x i32> %v, <4 x i1> %mask) { ; CHECK-LABEL: compressstore_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB10_5 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB10_6 -; CHECK-NEXT: .LBB10_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB10_7 -; CHECK-NEXT: .LBB10_3: # %else5 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: bnez a1, .LBB10_8 -; CHECK-NEXT: .LBB10_4: # %else8 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB10_5: # %cond.store -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB10_2 -; CHECK-NEXT: .LBB10_6: # %cond.store1 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 1 -; CHECK-NEXT: vse32.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB10_3 -; CHECK-NEXT: .LBB10_7: # %cond.store4 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v9, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a1, a1, 8 -; CHECK-NEXT: beqz a1, .LBB10_4 -; CHECK-NEXT: .LBB10_8: # %cond.store7 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 3 -; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v4i32(<4 x i32> %v, ptr align 4 %base, <4 x i1> %mask) ret void @@ -458,89 +160,11 @@ declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>) define void @compressstore_v8i32(ptr %base, <8 x i32> %v, <8 x i1> %mask) { ; CHECK-LABEL: compressstore_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv.x.s a1, v0 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB11_9 -; CHECK-NEXT: # %bb.1: # %else -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: bnez a2, .LBB11_10 -; CHECK-NEXT: .LBB11_2: # %else2 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: bnez a2, .LBB11_11 -; CHECK-NEXT: .LBB11_3: # %else5 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: bnez a2, .LBB11_12 -; CHECK-NEXT: .LBB11_4: # %else8 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: bnez a2, .LBB11_13 -; CHECK-NEXT: .LBB11_5: # %else11 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: bnez a2, .LBB11_14 -; CHECK-NEXT: .LBB11_6: # %else14 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: bnez a2, .LBB11_15 -; CHECK-NEXT: .LBB11_7: # %else17 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: bnez a1, .LBB11_16 -; CHECK-NEXT: .LBB11_8: # %else20 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB11_9: # %cond.store -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 2 -; CHECK-NEXT: beqz a2, .LBB11_2 -; CHECK-NEXT: .LBB11_10: # %cond.store1 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 1 -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 4 -; CHECK-NEXT: beqz a2, .LBB11_3 -; CHECK-NEXT: .LBB11_11: # %cond.store4 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 2 -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 8 -; CHECK-NEXT: beqz a2, .LBB11_4 -; CHECK-NEXT: .LBB11_12: # %cond.store7 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 3 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vcompress.vm v10, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 16 -; CHECK-NEXT: beqz a2, .LBB11_5 -; CHECK-NEXT: .LBB11_13: # %cond.store10 -; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 4 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 32 -; CHECK-NEXT: beqz a2, .LBB11_6 -; CHECK-NEXT: .LBB11_14: # %cond.store13 -; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 5 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a2, a1, 64 -; CHECK-NEXT: beqz a2, .LBB11_7 -; CHECK-NEXT: .LBB11_15: # %cond.store16 -; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 6 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: andi a1, a1, -128 -; CHECK-NEXT: beqz a1, .LBB11_8 -; CHECK-NEXT: .LBB11_16: # %cond.store19 -; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 7 -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret call void @llvm.masked.compressstore.v8i32(<8 x i32> %v, ptr align 4 %base, <8 x i1> %mask) ret void @@ -548,439 +172,59 @@ define void @compressstore_v8i32(ptr %base, <8 x i32> %v, <8 x i1> %mask) { declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>) define void @compressstore_v1i64(ptr %base, <1 x i64> %v, <1 x i1> %mask) { -; RV32-LABEL: compressstore_v1i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vfirst.m a1, v0 -; RV32-NEXT: bnez a1, .LBB12_2 -; RV32-NEXT: # %bb.1: # %cond.store -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v9, v8, a1 -; RV32-NEXT: vmv.x.s a1, v9 -; RV32-NEXT: vmv.x.s a2, v8 -; RV32-NEXT: sw a2, 0(a0) -; RV32-NEXT: sw a1, 4(a0) -; RV32-NEXT: .LBB12_2: # %else -; RV32-NEXT: ret -; -; RV64-LABEL: compressstore_v1i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vfirst.m a1, v0 -; RV64-NEXT: bnez a1, .LBB12_2 -; RV64-NEXT: # %bb.1: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: .LBB12_2: # %else -; RV64-NEXT: ret +; CHECK-LABEL: compressstore_v1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v9, (a0) +; CHECK-NEXT: ret call void @llvm.masked.compressstore.v1i64(<1 x i64> %v, ptr align 8 %base, <1 x i1> %mask) ret void } declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>) define void @compressstore_v2i64(ptr %base, <2 x i64> %v, <2 x i1> %mask) { -; RV32-LABEL: compressstore_v2i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB13_3 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: bnez a1, .LBB13_4 -; RV32-NEXT: .LBB13_2: # %else2 -; RV32-NEXT: ret -; RV32-NEXT: .LBB13_3: # %cond.store -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v9, v8, a2 -; RV32-NEXT: vmv.x.s a2, v9 -; RV32-NEXT: vmv.x.s a3, v8 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, 2 -; RV32-NEXT: beqz a1, .LBB13_2 -; RV32-NEXT: .LBB13_4: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 1 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v9, v8, a1 -; RV32-NEXT: vmv.x.s a1, v9 -; RV32-NEXT: vmv.x.s a2, v8 -; RV32-NEXT: sw a2, 0(a0) -; RV32-NEXT: sw a1, 4(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: compressstore_v2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB13_3 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: bnez a1, .LBB13_4 -; RV64-NEXT: .LBB13_2: # %else2 -; RV64-NEXT: ret -; RV64-NEXT: .LBB13_3: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, 2 -; RV64-NEXT: beqz a1, .LBB13_2 -; RV64-NEXT: .LBB13_4: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 1 -; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: ret +; CHECK-LABEL: compressstore_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vcompress.vm v9, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vse64.v v9, (a0) +; CHECK-NEXT: ret call void @llvm.masked.compressstore.v2i64(<2 x i64> %v, ptr align 8 %base, <2 x i1> %mask) ret void } declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>) define void @compressstore_v4i64(ptr %base, <4 x i64> %v, <4 x i1> %mask) { -; RV32-LABEL: compressstore_v4i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB14_5 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB14_6 -; RV32-NEXT: .LBB14_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB14_7 -; RV32-NEXT: .LBB14_3: # %else5 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: bnez a1, .LBB14_8 -; RV32-NEXT: .LBB14_4: # %else8 -; RV32-NEXT: ret -; RV32-NEXT: .LBB14_5: # %cond.store -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vsrl.vx v10, v8, a2 -; RV32-NEXT: vmv.x.s a2, v10 -; RV32-NEXT: vmv.x.s a3, v8 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB14_2 -; RV32-NEXT: .LBB14_6: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 1 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v12, v10, a2 -; RV32-NEXT: vmv.x.s a2, v12 -; RV32-NEXT: vmv.x.s a3, v10 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB14_3 -; RV32-NEXT: .LBB14_7: # %cond.store4 -; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 2 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v12, v10, a2 -; RV32-NEXT: vmv.x.s a2, v12 -; RV32-NEXT: vmv.x.s a3, v10 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, 8 -; RV32-NEXT: beqz a1, .LBB14_4 -; RV32-NEXT: .LBB14_8: # %cond.store7 -; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 3 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v10, v8, a1 -; RV32-NEXT: vmv.x.s a1, v10 -; RV32-NEXT: vmv.x.s a2, v8 -; RV32-NEXT: sw a2, 0(a0) -; RV32-NEXT: sw a1, 4(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: compressstore_v4i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB14_5 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB14_6 -; RV64-NEXT: .LBB14_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB14_7 -; RV64-NEXT: .LBB14_3: # %else5 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: bnez a1, .LBB14_8 -; RV64-NEXT: .LBB14_4: # %else8 -; RV64-NEXT: ret -; RV64-NEXT: .LBB14_5: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB14_2 -; RV64-NEXT: .LBB14_6: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 1 -; RV64-NEXT: vse64.v v10, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB14_3 -; RV64-NEXT: .LBB14_7: # %cond.store4 -; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v10, v8, 2 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v10, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, 8 -; RV64-NEXT: beqz a1, .LBB14_4 -; RV64-NEXT: .LBB14_8: # %cond.store7 -; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 3 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: ret +; CHECK-LABEL: compressstore_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vcompress.vm v10, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vse64.v v10, (a0) +; CHECK-NEXT: ret call void @llvm.masked.compressstore.v4i64(<4 x i64> %v, ptr align 8 %base, <4 x i1> %mask) ret void } declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>) define void @compressstore_v8i64(ptr %base, <8 x i64> %v, <8 x i1> %mask) { -; RV32-LABEL: compressstore_v8i64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV32-NEXT: vmv.x.s a1, v0 -; RV32-NEXT: andi a2, a1, 1 -; RV32-NEXT: bnez a2, .LBB15_9 -; RV32-NEXT: # %bb.1: # %else -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: bnez a2, .LBB15_10 -; RV32-NEXT: .LBB15_2: # %else2 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: bnez a2, .LBB15_11 -; RV32-NEXT: .LBB15_3: # %else5 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: bnez a2, .LBB15_12 -; RV32-NEXT: .LBB15_4: # %else8 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: bnez a2, .LBB15_13 -; RV32-NEXT: .LBB15_5: # %else11 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: bnez a2, .LBB15_14 -; RV32-NEXT: .LBB15_6: # %else14 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: bnez a2, .LBB15_15 -; RV32-NEXT: .LBB15_7: # %else17 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: bnez a1, .LBB15_16 -; RV32-NEXT: .LBB15_8: # %else20 -; RV32-NEXT: ret -; RV32-NEXT: .LBB15_9: # %cond.store -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vsrl.vx v12, v8, a2 -; RV32-NEXT: vmv.x.s a2, v12 -; RV32-NEXT: vmv.x.s a3, v8 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 2 -; RV32-NEXT: beqz a2, .LBB15_2 -; RV32-NEXT: .LBB15_10: # %cond.store1 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 1 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v16, v12, a2 -; RV32-NEXT: vmv.x.s a2, v16 -; RV32-NEXT: vmv.x.s a3, v12 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 4 -; RV32-NEXT: beqz a2, .LBB15_3 -; RV32-NEXT: .LBB15_11: # %cond.store4 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 2 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v16, v12, a2 -; RV32-NEXT: vmv.x.s a2, v16 -; RV32-NEXT: vmv.x.s a3, v12 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 8 -; RV32-NEXT: beqz a2, .LBB15_4 -; RV32-NEXT: .LBB15_12: # %cond.store7 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 3 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v16, v12, a2 -; RV32-NEXT: vmv.x.s a2, v16 -; RV32-NEXT: vmv.x.s a3, v12 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 16 -; RV32-NEXT: beqz a2, .LBB15_5 -; RV32-NEXT: .LBB15_13: # %cond.store10 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 4 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v16, v12, a2 -; RV32-NEXT: vmv.x.s a2, v16 -; RV32-NEXT: vmv.x.s a3, v12 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 32 -; RV32-NEXT: beqz a2, .LBB15_6 -; RV32-NEXT: .LBB15_14: # %cond.store13 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 5 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v16, v12, a2 -; RV32-NEXT: vmv.x.s a2, v16 -; RV32-NEXT: vmv.x.s a3, v12 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a2, a1, 64 -; RV32-NEXT: beqz a2, .LBB15_7 -; RV32-NEXT: .LBB15_15: # %cond.store16 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 6 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v16, v12, a2 -; RV32-NEXT: vmv.x.s a2, v16 -; RV32-NEXT: vmv.x.s a3, v12 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a2, 4(a0) -; RV32-NEXT: addi a0, a0, 8 -; RV32-NEXT: andi a1, a1, -128 -; RV32-NEXT: beqz a1, .LBB15_8 -; RV32-NEXT: .LBB15_16: # %cond.store19 -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 7 -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v12, v8, a1 -; RV32-NEXT: vmv.x.s a1, v12 -; RV32-NEXT: vmv.x.s a2, v8 -; RV32-NEXT: sw a2, 0(a0) -; RV32-NEXT: sw a1, 4(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: compressstore_v8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64-NEXT: vmv.x.s a1, v0 -; RV64-NEXT: andi a2, a1, 1 -; RV64-NEXT: bnez a2, .LBB15_11 -; RV64-NEXT: # %bb.1: # %else -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: bnez a2, .LBB15_12 -; RV64-NEXT: .LBB15_2: # %else2 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: bnez a2, .LBB15_13 -; RV64-NEXT: .LBB15_3: # %else5 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: beqz a2, .LBB15_5 -; RV64-NEXT: .LBB15_4: # %cond.store7 -; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 3 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v12, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: .LBB15_5: # %else8 -; RV64-NEXT: addi sp, sp, -320 -; RV64-NEXT: .cfi_def_cfa_offset 320 -; RV64-NEXT: sd ra, 312(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 304(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 320 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: andi a2, a1, 16 -; RV64-NEXT: bnez a2, .LBB15_14 -; RV64-NEXT: # %bb.6: # %else11 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: bnez a2, .LBB15_15 -; RV64-NEXT: .LBB15_7: # %else14 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: bnez a2, .LBB15_16 -; RV64-NEXT: .LBB15_8: # %else17 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: beqz a1, .LBB15_10 -; RV64-NEXT: .LBB15_9: # %cond.store19 -; RV64-NEXT: mv a1, sp -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a1) -; RV64-NEXT: ld a1, 56(sp) -; RV64-NEXT: sd a1, 0(a0) -; RV64-NEXT: .LBB15_10: # %else20 -; RV64-NEXT: addi sp, s0, -320 -; RV64-NEXT: ld ra, 312(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 304(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 320 -; RV64-NEXT: ret -; RV64-NEXT: .LBB15_11: # %cond.store -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v8, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 2 -; RV64-NEXT: beqz a2, .LBB15_2 -; RV64-NEXT: .LBB15_12: # %cond.store1 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 1 -; RV64-NEXT: vse64.v v12, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 4 -; RV64-NEXT: beqz a2, .LBB15_3 -; RV64-NEXT: .LBB15_13: # %cond.store4 -; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV64-NEXT: vslidedown.vi v12, v8, 2 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vse64.v v12, (a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 8 -; RV64-NEXT: bnez a2, .LBB15_4 -; RV64-NEXT: j .LBB15_5 -; RV64-NEXT: .LBB15_14: # %cond.store10 -; RV64-NEXT: addi a2, sp, 192 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: ld a2, 224(sp) -; RV64-NEXT: sd a2, 0(a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 32 -; RV64-NEXT: beqz a2, .LBB15_7 -; RV64-NEXT: .LBB15_15: # %cond.store13 -; RV64-NEXT: addi a2, sp, 128 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: ld a2, 168(sp) -; RV64-NEXT: sd a2, 0(a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a2, a1, 64 -; RV64-NEXT: beqz a2, .LBB15_8 -; RV64-NEXT: .LBB15_16: # %cond.store16 -; RV64-NEXT: addi a2, sp, 64 -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: ld a2, 112(sp) -; RV64-NEXT: sd a2, 0(a0) -; RV64-NEXT: addi a0, a0, 8 -; RV64-NEXT: andi a1, a1, -128 -; RV64-NEXT: bnez a1, .LBB15_9 -; RV64-NEXT: j .LBB15_10 +; CHECK-LABEL: compressstore_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vcompress.vm v12, v8, v0 +; CHECK-NEXT: vcpop.m a1, v0 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vse64.v v12, (a0) +; CHECK-NEXT: ret call void @llvm.masked.compressstore.v8i64(<8 x i64> %v, ptr align 8 %base, <8 x i1> %mask) ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/SPIRV/ComparePointers.ll b/llvm/test/CodeGen/SPIRV/ComparePointers.ll index fd2084d..9be0594 100644 --- a/llvm/test/CodeGen/SPIRV/ComparePointers.ll +++ b/llvm/test/CodeGen/SPIRV/ComparePointers.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --mattr=+spirv1.3 %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; kernel void test(int global *in, int global *in2) { ;; if (!in) diff --git a/llvm/test/CodeGen/SPIRV/capability-kernel.ll b/llvm/test/CodeGen/SPIRV/capability-kernel.ll index 03ea58c..fea1951 100644 --- a/llvm/test/CodeGen/SPIRV/capability-kernel.ll +++ b/llvm/test/CodeGen/SPIRV/capability-kernel.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: OpCapability Addresses diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll index 062863a..7e9c621 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#INT8:]] = OpTypeInt 8 0 ; CHECK: %[[#PTR1:]] = OpTypePointer CrossWorkgroup %[[#INT8]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll index aaf97f8..fc999ba 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#FLOAT32:]] = OpTypeFloat 32 ; CHECK: %[[#PTR:]] = OpTypePointer CrossWorkgroup %[[#FLOAT32]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll index 6d12023..a3a730a 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 0 ; CHECK-DAG: %[[#PTR1:]] = OpTypePointer Function %[[#INT]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll index 9e136ce..b74a344 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#IMAGE:]] = OpTypeImage %2 2D 0 0 0 0 Unknown ReadOnly diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll index 1fcc6d9..b8f205a 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#FLOAT32:]] = OpTypeFloat 32 ; CHECK-DAG: %[[#PTR1:]] = OpTypePointer Function %[[#FLOAT32]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll index 1b4e7a3..1667abc 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[#INT8:]] = OpTypeInt 8 0 ; CHECK: %[[#PTR1:]] = OpTypePointer CrossWorkgroup %[[#INT8]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll b/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll index 00b03c0..3a0d65e 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; TODO: OpFunctionParameter should be a pointer of struct base type. ; XFAIL: * diff --git a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll index 86f5f5b..d426fc4 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s -; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: %[[TyInt8:.*]] = OpTypeInt 8 0 ; CHECK: %[[TyInt8Ptr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt8]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll b/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll index 52180d5..23c3faa 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 ; CHECK-DAG: %[[#GLOBAL_PTR_INT:]] = OpTypePointer CrossWorkgroup %[[#INT]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll b/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll index 473c2a8..83234e3 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[#float:]] = OpTypeFloat 32 ; CHECK-DAG: %[[#pointer:]] = OpTypePointer CrossWorkgroup %[[#float]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll new file mode 100644 index 0000000..76769ab --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll @@ -0,0 +1,28 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "known_type_ptr" +; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo" +; CHECK-SPIRV-DAG: OpName %[[ArgToDeduce:.*]] "unknown_type_ptr" +; CHECK-SPIRV-DAG: OpName %[[Bar:.*]] "bar" +; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 32 0 +; CHECK-SPIRV-DAG: %[[Void:.*]] = OpTypeVoid +; CHECK-SPIRV-DAG: %[[LongPtr:.*]] = OpTypePointer CrossWorkgroup %[[Long]] +; CHECK-SPIRV-DAG: %[[Fun:.*]] = OpTypeFunction %[[Void]] %[[LongPtr]] +; CHECK-SPIRV: %[[Bar]] = OpFunction %[[Void]] None %[[Fun]] +; CHECK-SPIRV: %[[ArgToDeduce]] = OpFunctionParameter %[[LongPtr]] +; CHECK-SPIRV: OpFunctionCall %[[Void]] %[[Foo]] %[[ArgToDeduce]] +; CHECK-SPIRV: %[[Foo]] = OpFunction %[[Void]] None %[[Fun]] +; CHECK-SPIRV: %[[FooArg]] = OpFunctionParameter %[[LongPtr]] + +define spir_kernel void @bar(ptr addrspace(1) %unknown_type_ptr) { +entry: + call spir_func void @foo(ptr addrspace(1) %unknown_type_ptr) + ret void +} + +define void @foo(ptr addrspace(1) %known_type_ptr) { +entry: + %elem = getelementptr inbounds i32, ptr addrspace(1) %known_type_ptr, i64 0 + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll new file mode 100644 index 0000000..8cbf360 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll @@ -0,0 +1,28 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "known_type_ptr" +; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo" +; CHECK-SPIRV-DAG: OpName %[[ArgToDeduce:.*]] "unknown_type_ptr" +; CHECK-SPIRV-DAG: OpName %[[Bar:.*]] "bar" +; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 32 0 +; CHECK-SPIRV-DAG: %[[Void:.*]] = OpTypeVoid +; CHECK-SPIRV-DAG: %[[LongPtr:.*]] = OpTypePointer CrossWorkgroup %[[Long]] +; CHECK-SPIRV-DAG: %[[Fun:.*]] = OpTypeFunction %[[Void]] %[[LongPtr]] +; CHECK-SPIRV: %[[Foo]] = OpFunction %[[Void]] None %[[Fun]] +; CHECK-SPIRV: %[[FooArg]] = OpFunctionParameter %[[LongPtr]] +; CHECK-SPIRV: %[[Bar]] = OpFunction %[[Void]] None %[[Fun]] +; CHECK-SPIRV: %[[ArgToDeduce]] = OpFunctionParameter %[[LongPtr]] +; CHECK-SPIRV: OpFunctionCall %[[Void]] %[[Foo]] %[[ArgToDeduce]] + +define void @foo(ptr addrspace(1) %known_type_ptr) { +entry: + %elem = getelementptr inbounds i32, ptr addrspace(1) %known_type_ptr, i64 0 + ret void +} + +define spir_kernel void @bar(ptr addrspace(1) %unknown_type_ptr) { +entry: + call spir_func void @foo(ptr addrspace(1) %unknown_type_ptr) + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll b/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll new file mode 100644 index 0000000..f144418 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll @@ -0,0 +1,29 @@ +; This test is to check that two functions have different SPIR-V type +; definitions, even though their LLVM function types are identical. + +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpName %[[Fun32:.*]] "tp_arg_i32" +; CHECK-DAG: OpName %[[Fun64:.*]] "tp_arg_i64" +; CHECK-DAG: %[[TyI32:.*]] = OpTypeInt 32 0 +; CHECK-DAG: %[[TyVoid:.*]] = OpTypeVoid +; CHECK-DAG: %[[TyPtr32:.*]] = OpTypePointer Function %[[TyI32]] +; CHECK-DAG: %[[TyFun32:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtr32]] +; CHECK-DAG: %[[TyI64:.*]] = OpTypeInt 64 0 +; CHECK-DAG: %[[TyPtr64:.*]] = OpTypePointer Function %[[TyI64]] +; CHECK-DAG: %[[TyFun64:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtr64]] +; CHECK-DAG: %[[Fun32]] = OpFunction %[[TyVoid]] None %[[TyFun32]] +; CHECK-DAG: %[[Fun64]] = OpFunction %[[TyVoid]] None %[[TyFun64]] + +define spir_kernel void @tp_arg_i32(ptr %ptr) { +entry: + store i32 1, ptr %ptr + ret void +} + +define spir_kernel void @tp_arg_i64(ptr %ptr) { +entry: + store i64 1, ptr %ptr + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/relationals.ll b/llvm/test/CodeGen/SPIRV/relationals.ll index 1644dc7..f4fcf4d 100644 --- a/llvm/test/CodeGen/SPIRV/relationals.ll +++ b/llvm/test/CodeGen/SPIRV/relationals.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} declare dso_local spir_func <4 x i8> @_Z13__spirv_IsNanIDv4_aDv4_fET_T0_(<4 x float>) declare dso_local spir_func <4 x i8> @_Z13__spirv_IsInfIDv4_aDv4_fET_T0_(<4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/simple.ll b/llvm/test/CodeGen/SPIRV/simple.ll index de9efa8..63c1596 100644 --- a/llvm/test/CodeGen/SPIRV/simple.ll +++ b/llvm/test/CodeGen/SPIRV/simple.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; Support of doubles is required. ; CHECK: OpCapability Float64 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll index fdb26ba..55cfcea 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; __kernel void testAtomicCompareExchangeExplicit_cl20( ;; volatile global atomic_int* object, diff --git a/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll b/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll index 55161e6..11b0578 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK: OpDecorate %[[#FUNC_NAME:]] LinkageAttributes "_Z10BitReversei" ; CHECK-NOT: OpBitReverse diff --git a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll index 95f3673..b63c1c6 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: %[[#]] = OpBuildNDRange %[[#]] %[[#GWS:]] %[[#LWS:]] %[[#GWO:]] ; CHECK-SPIRV-DAG: %[[#GWS]] = OpConstant %[[#]] 123 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll index a2ae808..65c992c 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll @@ -19,6 +19,7 @@ ;; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -cl-std=CL2.0 -triple spir64-unknown-unknown -emit-llvm -include opencl-20.h BuildNDRange_2.cl -o BuildNDRange_2.ll ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; TODO(#60133): Requires updates following opaque pointer migration. ; XFAIL: * diff --git a/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll b/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll index 3403695..93aecc5 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; kernel void testConvertPtrToU(global int *a, global unsigned long *res) { ;; res[0] = (unsigned long)&a[0]; diff --git a/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll b/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll index 2e9b4a4..d4fc5c3 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpDecorate %[[#ALIGNMENT:]] Alignment 16 ; CHECK-SPIRV: %[[#ALIGNMENT]] = OpFunctionParameter %[[#]] diff --git a/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll b/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll index 64f25b7..966d835 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpName %[[#PTR_ID:]] "ptr" ; CHECK-SPIRV: OpName %[[#PTR2_ID:]] "ptr2" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll b/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll index 2f423c2..67c3380 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: %[[#int:]] = OpTypeInt 32 0 ; CHECK-SPIRV-DAG: %[[#int2:]] = OpTypeVector %[[#int]] 2 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll b/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll index 6d6dd24..6e8726c 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: OpEntryPoint Kernel %[[#WORKER:]] "worker" ; CHECK-SPIRV-DAG: OpExecutionMode %[[#WORKER]] LocalSizeHint 128 10 1 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll b/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll index 2796dcb..33bece5 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpDecorate %[[#]] UserSemantic "annotation_on_function" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll index 331960c..417b89e 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; This test checks that the backend is capable to correctly translate ;; atomic_cmpxchg OpenCL C 1.2 built-in function [1] into corresponding SPIR-V diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll index 95eb6ad..3180b57 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; This test checks that the backend is capable to correctly translate ;; legacy atomic OpenCL C 1.2 built-in functions [1] into corresponding SPIR-V diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll index 0f3a62a..c94c130 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; This test checks that the backend is capable to correctly translate ;; atomic_work_item_fence OpenCL C 2.0 built-in function [1] into corresponding diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll index a126d94..cf4a247 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; This test checks that the backend is capable to correctly translate ;; barrier OpenCL C 1.2 built-in function [1] into corresponding SPIR-V diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll index 42b127c..5d9840d 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpCapability GroupNonUniformBallot ; CHECK-SPIRV: OpDecorate %[[#]] BuiltIn SubgroupGtMask diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll index 0874e6f..0702fd0 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; This test checks that the backend is capable to correctly translate ;; sub_group_barrier built-in function [1] from cl_khr_subgroups extension into diff --git a/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll b/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll index 3c563d3..20204ac 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; Types: ; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll b/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll index d013abc..3e5a3ac 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; Check 'LLVM ==> SPIR-V' conversion of atomic_load and atomic_store. diff --git a/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll b/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll index 8dbf4d2..2c0fc39 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; Check the bitcast is translated back to bitcast diff --git a/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll b/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll index 5ecd7f7..2249cbe 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV1_4 +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; TODO(#60133): Requires updates following opaque pointer migration. ; XFAIL: * diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll index 9b1ce76..0a02a8b 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: OpDecorate %[[#Id:]] BuiltIn GlobalInvocationId ; CHECK-SPIRV-DAG: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll index 8286671..f18f27a 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId ; CHECK-SPIRV: %[[#Id:]] = OpVariable %[[#]] diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll index 22aa40c..d39ca3c 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; The IR was generated from the following source: ;; #include <CL/sycl.hpp> diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll index 5b3474f..03456ae 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; The IR was generated from the following source: ;; #include <CL/sycl.hpp> diff --git a/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll b/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll index 6de610b..824ca1b2 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: %[[#IMAGE_TYPE:]] = OpTypeImage ; CHECK-SPIRV: %[[#IMAGE_ARG:]] = OpFunctionParameter %[[#IMAGE_TYPE]] diff --git a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll index 52b7dac..d7e87c0 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll @@ -19,6 +19,7 @@ ;; } ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: OpCapability Sampled1D ; CHECK-SPIRV-DAG: OpCapability SampledBuffer diff --git a/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll b/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll index 9054454..0cd75bb 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpTypeDeviceEvent ; CHECK-SPIRV: OpFunction diff --git a/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll b/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll index cf124ec..d23b068 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; TODO(#60133): Requires updates following opaque pointer migration. ; XFAIL: * diff --git a/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll b/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll index c186a81..49b84c1 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpSatConvertSToU diff --git a/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll b/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll index fd29bc8..0ed1dc7 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; TODO(#60133): Requires updates following opaque pointer migration. ; XFAIL: * diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll b/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll index 78d9a23..af76c0e 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpName %[[#r1:]] "r1" ; CHECK-SPIRV: OpName %[[#r2:]] "r2" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll b/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll index cfdcc72..550ec1a 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: %[[#]] = OpExtInst %[[#]] %[[#]] fclamp ; CHECK-SPIRV-NOT: %[[#]] = OpExtInst %[[#]] %[[#]] clamp diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll b/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll index 572ccc3..46eaba9 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpName %[[#r1:]] "r1" ; CHECK-SPIRV: OpName %[[#r2:]] "r2" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll b/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll index d0ed564..79b7868 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpName %[[#r1:]] "r1" ; CHECK-SPIRV: OpName %[[#r2:]] "r2" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll b/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll index f506787b..683b5c2 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll @@ -2,6 +2,7 @@ ;; { out = fmod( in1, in2 ); } ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: %[[#]] = OpExtInst %[[#]] %[[#]] fmod %[[#]] %[[#]] diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll b/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll index 886077a..fdab29c 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpName %[[#r1:]] "r1" ; CHECK-SPIRV: OpName %[[#r2:]] "r2" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll b/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll index e17601a..60bbfe6 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpName %[[#r1:]] "r1" ; CHECK-SPIRV: OpName %[[#r2:]] "r2" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll b/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll index c035c35..974043c 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-NOT: OpCapability FPFastMathModeINTEL ; CHECK-SPIRV: OpName %[[#mu:]] "mul" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/frem.ll b/llvm/test/CodeGen/SPIRV/transcoding/frem.ll index ecb8f6f..d36ba7f 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/frem.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/frem.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpName %[[#r1:]] "r1" ; CHECK-SPIRV: OpName %[[#r2:]] "r2" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll b/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll index 99d0d0e..3677c00 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpName %[[#r1:]] "r1" ; CHECK-SPIRV: OpName %[[#r2:]] "r2" diff --git a/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll b/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll index dc307c7..fd24196 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; Types: ; CHECK-DAG: %[[#INT:]] = OpTypeInt 32 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll b/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll index 2f44e19..ff1bec4 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV1_4 +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; TODO(#60133): Requires updates following opaque pointer migration. ; XFAIL: * diff --git a/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll b/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll index 6aa9faa..2412f40 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-DAG: %[[#int:]] = OpTypeInt 32 0 ; CHECK-SPIRV-DAG: %[[#float:]] = OpTypeFloat 32 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll b/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll index 3c818af..c5f3f9e 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV-NOT: OpSConvert diff --git a/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll b/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll index f771854..de7673a 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; This test checks following SYCL relational builtins with double and double2 ;; types: diff --git a/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll b/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll index 1f55ceb..69a4a30 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; This test checks following SYCL relational builtins with float and float2 ;; types: diff --git a/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll b/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll index 864fb4f..d6a7fda 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} ;; This test checks following SYCL relational builtins with half and half2 types: ;; isfinite, isinf, isnan, isnormal, signbit, isequal, isnotequal, isgreater diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll index 085cde8..7a5baa0 100644 --- a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll +++ b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll @@ -97,7 +97,7 @@ $_ZTI7Derived = comdat any ; Function Attrs: nounwind uwtable define weak_odr dso_local dllexport void @_ZN4BaseC2Ev(ptr noundef nonnull align 8 dereferenceable(12) %0) unnamed_addr #0 comdat align 2 { - store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5 + store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5 %2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1 store i32 0, ptr %2, align 8, !tbaa !8 ret void @@ -105,7 +105,7 @@ define weak_odr dso_local dllexport void @_ZN4BaseC2Ev(ptr noundef nonnull align ; Function Attrs: nounwind uwtable define weak_odr dso_local dllexport void @_ZN4BaseC1Ev(ptr noundef nonnull align 8 dereferenceable(12) %0) unnamed_addr #0 comdat align 2 { - store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5 + store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5 %2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1 store i32 0, ptr %2, align 8, !tbaa !8 ret void @@ -140,10 +140,10 @@ declare dso_local void @_ZdlPv(ptr noundef) local_unnamed_addr #2 ; Function Attrs: nounwind uwtable define weak_odr dso_local dllexport void @_ZN7DerivedC2Ev(ptr noundef nonnull align 8 dereferenceable(16) %0) unnamed_addr #0 comdat align 2 { - store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5 + store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5 %2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1 store i32 0, ptr %2, align 8, !tbaa !8 - store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5 + store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5 %3 = getelementptr inbounds %class.Derived, ptr %0, i64 0, i32 1 store i32 0, ptr %3, align 4, !tbaa !12 ret void @@ -151,10 +151,10 @@ define weak_odr dso_local dllexport void @_ZN7DerivedC2Ev(ptr noundef nonnull al ; Function Attrs: nounwind uwtable define weak_odr dso_local dllexport void @_ZN7DerivedC1Ev(ptr noundef nonnull align 8 dereferenceable(16) %0) unnamed_addr #0 comdat align 2 { - store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5 + store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5 %2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1 store i32 0, ptr %2, align 8, !tbaa !8 - store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5 + store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5 %3 = getelementptr inbounds %class.Derived, ptr %0, i64 0, i32 1 store i32 0, ptr %3, align 4, !tbaa !12 ret void diff --git a/llvm/test/CodeGen/X86/tls-align.ll b/llvm/test/CodeGen/X86/tls-align.ll index 3c8ee6b..e996c00 100644 --- a/llvm/test/CodeGen/X86/tls-align.ll +++ b/llvm/test/CodeGen/X86/tls-align.ll @@ -12,7 +12,7 @@ define internal fastcc void @foo() unnamed_addr { entry: - store <8 x ptr> <ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null>, ptr @array, align 32 + store <8 x ptr> <ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null>, ptr @array, align 32 ret void } |