diff options
Diffstat (limited to 'llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll')
-rw-r--r-- | llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll | 1590 |
1 files changed, 934 insertions, 656 deletions
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index 92cb51b..f871e403 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -2,19 +2,18 @@ ; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM80 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ +; RUN: %if ptxas-sm_80 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_80 %} -; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \ +; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \ +; RUN: %if ptxas-sm_100 && ptxas-isa-8.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx88 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" -; Check straight line reduction. define half @reduce_fadd_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fadd_half( ; CHECK: { @@ -43,45 +42,22 @@ define half @reduce_fadd_half(<8 x half> %in) { } define half @reduce_fadd_half_reassoc(<8 x half> %in) { -; CHECK-SM80-LABEL: reduce_fadd_half_reassoc( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<6>; -; CHECK-SM80-NEXT: .reg .b32 %r<10>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-SM80-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM80-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM80-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: add.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-SM80-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_fadd_half_reassoc( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<6>; -; CHECK-SM100-NEXT: .reg .b32 %r<10>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-SM100-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM100-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM100-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: add.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-SM100-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_fadd_half_reassoc( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<6>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; +; CHECK-NEXT: add.rn.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: add.rn.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: add.rn.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: add.rn.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: mov.b16 %rs4, 0x0000; +; CHECK-NEXT: add.rn.f16 %rs5, %rs3, %rs4; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in) ret half %res } @@ -109,30 +85,47 @@ define half @reduce_fadd_half_reassoc_nonpow2(<7 x half> %in) { ret half %res } -; Check straight-line reduction. define float @reduce_fadd_float(<8 x float> %in) { -; CHECK-LABEL: reduce_fadd_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<17>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0]; -; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; -; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; -; CHECK-NEXT: add.rn.f32 %r9, %r7, 0f00000000; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: add.rn.f32 %r11, %r10, %r5; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r6; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r3; -; CHECK-NEXT: add.rn.f32 %r14, %r13, %r4; -; CHECK-NEXT: add.rn.f32 %r15, %r14, %r1; -; CHECK-NEXT: add.rn.f32 %r16, %r15, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r16; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fadd_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<17>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r1, 0f00000000; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r2; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r3; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r11, %r4; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r5; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r7; +; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, %r8; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<17>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-SM100-NEXT: add.rn.f32 %r9, %r7, 0f00000000; +; CHECK-SM100-NEXT: add.rn.f32 %r10, %r9, %r8; +; CHECK-SM100-NEXT: add.rn.f32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: add.rn.f32 %r12, %r11, %r6; +; CHECK-SM100-NEXT: add.rn.f32 %r13, %r12, %r3; +; CHECK-SM100-NEXT: add.rn.f32 %r14, %r13, %r4; +; CHECK-SM100-NEXT: add.rn.f32 %r15, %r14, %r1; +; CHECK-SM100-NEXT: add.rn.f32 %r16, %r15, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r16; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) ret float %res } @@ -141,22 +134,17 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fadd_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<17>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: add.rn.f32 %r5, %r3, %r1; -; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-SM80-NEXT: add.rn.f32 %r10, %r8, %r6; -; CHECK-SM80-NEXT: add.rn.f32 %r11, %r4, %r2; -; CHECK-SM80-NEXT: add.rn.f32 %r12, %r9, %r7; -; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r11; -; CHECK-SM80-NEXT: add.rn.f32 %r14, %r10, %r5; -; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r13; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r11; ; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, 0f00000000; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16; ; CHECK-SM80-NEXT: ret; @@ -164,7 +152,7 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) { ; CHECK-SM100-LABEL: reduce_fadd_float_reassoc( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b32 %r<5>; -; CHECK-SM100-NEXT: .reg .b64 %rd<10>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16]; @@ -172,11 +160,8 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) { ; CHECK-SM100-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4; ; CHECK-SM100-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; ; CHECK-SM100-NEXT: add.rn.f32x2 %rd7, %rd6, %rd5; -; CHECK-SM100-NEXT: mov.b64 {_, %r1}, %rd7; -; CHECK-SM100-NEXT: // implicit-def: %r2 -; CHECK-SM100-NEXT: mov.b64 %rd8, {%r1, %r2}; -; CHECK-SM100-NEXT: add.rn.f32x2 %rd9, %rd7, %rd8; -; CHECK-SM100-NEXT: mov.b64 {%r3, _}, %rd9; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd7; +; CHECK-SM100-NEXT: add.rn.f32 %r3, %r1, %r2; ; CHECK-SM100-NEXT: add.rn.f32 %r4, %r3, 0f00000000; ; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-SM100-NEXT: ret; @@ -229,7 +214,6 @@ define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) { ret float %res } -; Check straight line reduction. define half @reduce_fmul_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fmul_half( ; CHECK: { @@ -256,41 +240,20 @@ define half @reduce_fmul_half(<8 x half> %in) { } define half @reduce_fmul_half_reassoc(<8 x half> %in) { -; CHECK-SM80-LABEL: reduce_fmul_half_reassoc( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<10>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_fmul_half_reassoc( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<10>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_fmul_half_reassoc( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-NEXT: mul.rn.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: mul.rn.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: mul.rn.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in) ret half %res } @@ -321,29 +284,45 @@ define half @reduce_fmul_half_reassoc_nonpow2(<7 x half> %in) { ret half %res } -; Check straight-line reduction. define float @reduce_fmul_float(<8 x float> %in) { -; CHECK-LABEL: reduce_fmul_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0]; -; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; -; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; -; CHECK-NEXT: mul.rn.f32 %r9, %r7, %r8; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r5; -; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r6; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r3; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r4; -; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r1; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r1, %r2; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r3; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r4; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r11, %r5; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r13, %r7; +; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r8; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-SM100-NEXT: mul.rn.f32 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mul.rn.f32 %r10, %r9, %r5; +; CHECK-SM100-NEXT: mul.rn.f32 %r11, %r10, %r6; +; CHECK-SM100-NEXT: mul.rn.f32 %r12, %r11, %r3; +; CHECK-SM100-NEXT: mul.rn.f32 %r13, %r12, %r4; +; CHECK-SM100-NEXT: mul.rn.f32 %r14, %r13, %r1; +; CHECK-SM100-NEXT: mul.rn.f32 %r15, %r14, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) ret float %res } @@ -352,29 +331,24 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) { ; CHECK-SM80-LABEL: reduce_fmul_float_reassoc( ; CHECK-SM80: { ; CHECK-SM80-NEXT: .reg .b32 %r<16>; -; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-SM80-NEXT: mul.rn.f32 %r5, %r3, %r1; -; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r8, %r6; -; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r4, %r2; -; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r9, %r7; -; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r11; -; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r10, %r5; -; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r13; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r11; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-SM80-NEXT: ret; ; ; CHECK-SM100-LABEL: reduce_fmul_float_reassoc( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b32 %r<4>; -; CHECK-SM100-NEXT: .reg .b64 %rd<10>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16]; @@ -382,11 +356,8 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) { ; CHECK-SM100-NEXT: mul.rn.f32x2 %rd5, %rd2, %rd4; ; CHECK-SM100-NEXT: mul.rn.f32x2 %rd6, %rd1, %rd3; ; CHECK-SM100-NEXT: mul.rn.f32x2 %rd7, %rd6, %rd5; -; CHECK-SM100-NEXT: mov.b64 {_, %r1}, %rd7; -; CHECK-SM100-NEXT: // implicit-def: %r2 -; CHECK-SM100-NEXT: mov.b64 %rd8, {%r1, %r2}; -; CHECK-SM100-NEXT: mul.rn.f32x2 %rd9, %rd7, %rd8; -; CHECK-SM100-NEXT: mov.b64 {%r3, _}, %rd9; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd7; +; CHECK-SM100-NEXT: mul.rn.f32 %r3, %r1, %r2; ; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) @@ -436,7 +407,6 @@ define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) { ret float %res } -; Check straight line reduction. define half @reduce_fmax_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fmax_half( ; CHECK: { @@ -501,84 +471,241 @@ define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) { ret half %res } -; Check straight-line reduction. -define float @reduce_fmax_float(<8 x float> %in) { -; -; CHECK-LABEL: reduce_fmax_float( +define half @reduce_fmax_half_nnan(<8 x half> %in) { +; CHECK-LABEL: reduce_fmax_half_nnan( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: max.f32 %r5, %r4, %r2; -; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-NEXT: max.f32 %r10, %r9, %r7; -; CHECK-NEXT: max.f32 %r11, %r10, %r5; -; CHECK-NEXT: max.f32 %r12, %r3, %r1; -; CHECK-NEXT: max.f32 %r13, %r8, %r6; -; CHECK-NEXT: max.f32 %r14, %r13, %r12; -; CHECK-NEXT: max.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_nnan_param_0]; +; CHECK-NEXT: max.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: max.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: max.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: max.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; ; CHECK-NEXT: ret; - %res = call float @llvm.vector.reduce.fmax(<8 x float> %in) - ret float %res + %res = call nnan half @llvm.vector.reduce.fmax(<8 x half> %in) + ret half %res } -define float @reduce_fmax_float_reassoc(<8 x float> %in) { -; -; CHECK-LABEL: reduce_fmax_float_reassoc( +define half @reduce_fmax_half_nnan_nonpow2(<7 x half> %in) { +; CHECK-LABEL: reduce_fmax_half_nnan_nonpow2( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b16 %rs<12>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_reassoc_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: max.f32 %r5, %r4, %r2; -; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-NEXT: max.f32 %r10, %r9, %r7; -; CHECK-NEXT: max.f32 %r11, %r10, %r5; -; CHECK-NEXT: max.f32 %r12, %r3, %r1; -; CHECK-NEXT: max.f32 %r13, %r8, %r6; -; CHECK-NEXT: max.f32 %r14, %r13, %r12; -; CHECK-NEXT: max.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmax_half_nnan_nonpow2_param_0+8]; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [reduce_fmax_half_nnan_nonpow2_param_0]; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmax_half_nnan_nonpow2_param_0+12]; +; CHECK-NEXT: max.f16x2 %r4, %r2, %r1; +; CHECK-NEXT: mov.b16 %rs8, 0xFC00; +; CHECK-NEXT: mov.b32 %r5, {%rs7, %rs8}; +; CHECK-NEXT: max.f16x2 %r6, %r3, %r5; +; CHECK-NEXT: max.f16x2 %r7, %r4, %r6; +; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7; +; CHECK-NEXT: max.f16 %rs11, %rs9, %rs10; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs11; ; CHECK-NEXT: ret; + %res = call nnan half @llvm.vector.reduce.fmax(<7 x half> %in) + ret half %res +} + +define float @reduce_fmax_float(<8 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmax_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0]; +; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: max.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: max.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmax_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-SM100-NEXT: max.f32 %r9, %r8, %r5, %r6; +; CHECK-SM100-NEXT: max.f32 %r10, %r3, %r4, %r7; +; CHECK-SM100-NEXT: max.f32 %r11, %r10, %r9, %r1; +; CHECK-SM100-NEXT: max.f32 %r12, %r11, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; + %res = call float @llvm.vector.reduce.fmax(<8 x float> %in) + ret float %res +} + +define float @reduce_fmax_float_reassoc(<8 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmax_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0]; +; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: max.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: max.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmax_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_reassoc_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-SM100-NEXT: max.f32 %r9, %r8, %r5, %r6; +; CHECK-SM100-NEXT: max.f32 %r10, %r3, %r4, %r7; +; CHECK-SM100-NEXT: max.f32 %r11, %r10, %r9, %r1; +; CHECK-SM100-NEXT: max.f32 %r12, %r11, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmax(<8 x float> %in) ret float %res } define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmax_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: max.f32 %r8, %r5, %r6; +; CHECK-SM80-NEXT: max.f32 %r9, %r8, %r7; +; CHECK-SM80-NEXT: max.f32 %r10, %r3, %r4; +; CHECK-SM80-NEXT: max.f32 %r11, %r1, %r2; +; CHECK-SM80-NEXT: max.f32 %r12, %r11, %r10; +; CHECK-SM80-NEXT: max.f32 %r13, %r12, %r9; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; ; -; CHECK-LABEL: reduce_fmax_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: max.f32 %r8, %r3, %r7; -; CHECK-NEXT: max.f32 %r9, %r1, %r5; -; CHECK-NEXT: max.f32 %r10, %r9, %r8; -; CHECK-NEXT: max.f32 %r11, %r2, %r6; -; CHECK-NEXT: max.f32 %r12, %r11, %r4; -; CHECK-NEXT: max.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM100-LABEL: reduce_fmax_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmax_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: max.f32 %r8, %r4, %r5, %r6; +; CHECK-SM100-NEXT: max.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: max.f32 %r10, %r9, %r8, %r7; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmax(<7 x float> %in) ret float %res } -; Check straight line reduction. +define float @reduce_fmax_float_nnan(<8 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmax_float_nnan( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_nnan_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_nnan_param_0]; +; CHECK-SM80-NEXT: max.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: max.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: max.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: max.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: max.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmax_float_nnan( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_nnan_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_nnan_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-SM100-NEXT: max.f32 %r9, %r8, %r5, %r6; +; CHECK-SM100-NEXT: max.f32 %r10, %r3, %r4, %r7; +; CHECK-SM100-NEXT: max.f32 %r11, %r10, %r9, %r1; +; CHECK-SM100-NEXT: max.f32 %r12, %r11, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; + %res = call nnan float @llvm.vector.reduce.fmax(<8 x float> %in) + ret float %res +} + +define float @reduce_fmax_float_nnan_nonpow2(<7 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmax_float_nnan_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmax_float_nnan_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_nnan_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_nnan_nonpow2_param_0]; +; CHECK-SM80-NEXT: max.f32 %r8, %r5, %r6; +; CHECK-SM80-NEXT: max.f32 %r9, %r8, %r7; +; CHECK-SM80-NEXT: max.f32 %r10, %r3, %r4; +; CHECK-SM80-NEXT: max.f32 %r11, %r1, %r2; +; CHECK-SM80-NEXT: max.f32 %r12, %r11, %r10; +; CHECK-SM80-NEXT: max.f32 %r13, %r12, %r9; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmax_float_nnan_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmax_float_nnan_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmax_float_nnan_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_nnan_nonpow2_param_0]; +; CHECK-SM100-NEXT: max.f32 %r8, %r4, %r5, %r6; +; CHECK-SM100-NEXT: max.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: max.f32 %r10, %r9, %r8, %r7; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; + %res = call nnan float @llvm.vector.reduce.fmax(<7 x float> %in) + ret float %res +} + define half @reduce_fmin_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fmin_half( ; CHECK: { @@ -643,84 +770,241 @@ define half @reduce_fmin_half_reassoc_nonpow2(<7 x half> %in) { ret half %res } -; Check straight-line reduction. -define float @reduce_fmin_float(<8 x float> %in) { -; -; CHECK-LABEL: reduce_fmin_float( +define half @reduce_fmin_half_nnan(<8 x half> %in) { +; CHECK-LABEL: reduce_fmin_half_nnan( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: min.f32 %r5, %r4, %r2; -; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-NEXT: min.f32 %r10, %r9, %r7; -; CHECK-NEXT: min.f32 %r11, %r10, %r5; -; CHECK-NEXT: min.f32 %r12, %r3, %r1; -; CHECK-NEXT: min.f32 %r13, %r8, %r6; -; CHECK-NEXT: min.f32 %r14, %r13, %r12; -; CHECK-NEXT: min.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_nnan_param_0]; +; CHECK-NEXT: min.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: min.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: min.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: min.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; ; CHECK-NEXT: ret; - %res = call float @llvm.vector.reduce.fmin(<8 x float> %in) - ret float %res + %res = call nnan half @llvm.vector.reduce.fmin(<8 x half> %in) + ret half %res } -define float @reduce_fmin_float_reassoc(<8 x float> %in) { -; -; CHECK-LABEL: reduce_fmin_float_reassoc( +define half @reduce_fmin_half_nnan_nonpow2(<7 x half> %in) { +; CHECK-LABEL: reduce_fmin_half_nnan_nonpow2( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b16 %rs<12>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_reassoc_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: min.f32 %r5, %r4, %r2; -; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-NEXT: min.f32 %r10, %r9, %r7; -; CHECK-NEXT: min.f32 %r11, %r10, %r5; -; CHECK-NEXT: min.f32 %r12, %r3, %r1; -; CHECK-NEXT: min.f32 %r13, %r8, %r6; -; CHECK-NEXT: min.f32 %r14, %r13, %r12; -; CHECK-NEXT: min.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-NEXT: ld.param.b32 %r1, [reduce_fmin_half_nnan_nonpow2_param_0+8]; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [reduce_fmin_half_nnan_nonpow2_param_0]; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: ld.param.b16 %rs7, [reduce_fmin_half_nnan_nonpow2_param_0+12]; +; CHECK-NEXT: min.f16x2 %r4, %r2, %r1; +; CHECK-NEXT: mov.b16 %rs8, 0x7C00; +; CHECK-NEXT: mov.b32 %r5, {%rs7, %rs8}; +; CHECK-NEXT: min.f16x2 %r6, %r3, %r5; +; CHECK-NEXT: min.f16x2 %r7, %r4, %r6; +; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r7; +; CHECK-NEXT: min.f16 %rs11, %rs9, %rs10; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs11; ; CHECK-NEXT: ret; + %res = call nnan half @llvm.vector.reduce.fmin(<7 x half> %in) + ret half %res +} + +define float @reduce_fmin_float(<8 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmin_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0]; +; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: min.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: min.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmin_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-SM100-NEXT: min.f32 %r9, %r8, %r5, %r6; +; CHECK-SM100-NEXT: min.f32 %r10, %r3, %r4, %r7; +; CHECK-SM100-NEXT: min.f32 %r11, %r10, %r9, %r1; +; CHECK-SM100-NEXT: min.f32 %r12, %r11, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; + %res = call float @llvm.vector.reduce.fmin(<8 x float> %in) + ret float %res +} + +define float @reduce_fmin_float_reassoc(<8 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmin_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0]; +; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: min.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: min.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmin_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_reassoc_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-SM100-NEXT: min.f32 %r9, %r8, %r5, %r6; +; CHECK-SM100-NEXT: min.f32 %r10, %r3, %r4, %r7; +; CHECK-SM100-NEXT: min.f32 %r11, %r10, %r9, %r1; +; CHECK-SM100-NEXT: min.f32 %r12, %r11, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmin(<8 x float> %in) ret float %res } define float @reduce_fmin_float_reassoc_nonpow2(<7 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmin_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: min.f32 %r8, %r5, %r6; +; CHECK-SM80-NEXT: min.f32 %r9, %r8, %r7; +; CHECK-SM80-NEXT: min.f32 %r10, %r3, %r4; +; CHECK-SM80-NEXT: min.f32 %r11, %r1, %r2; +; CHECK-SM80-NEXT: min.f32 %r12, %r11, %r10; +; CHECK-SM80-NEXT: min.f32 %r13, %r12, %r9; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; ; -; CHECK-LABEL: reduce_fmin_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: min.f32 %r8, %r3, %r7; -; CHECK-NEXT: min.f32 %r9, %r1, %r5; -; CHECK-NEXT: min.f32 %r10, %r9, %r8; -; CHECK-NEXT: min.f32 %r11, %r2, %r6; -; CHECK-NEXT: min.f32 %r12, %r11, %r4; -; CHECK-NEXT: min.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM100-LABEL: reduce_fmin_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmin_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: min.f32 %r8, %r4, %r5, %r6; +; CHECK-SM100-NEXT: min.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: min.f32 %r10, %r9, %r8, %r7; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmin(<7 x float> %in) ret float %res } -; Check straight-line reduction. +define float @reduce_fmin_float_nnan(<8 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmin_float_nnan( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_nnan_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_nnan_param_0]; +; CHECK-SM80-NEXT: min.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: min.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: min.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: min.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: min.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmin_float_nnan( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_nnan_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_nnan_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-SM100-NEXT: min.f32 %r9, %r8, %r5, %r6; +; CHECK-SM100-NEXT: min.f32 %r10, %r3, %r4, %r7; +; CHECK-SM100-NEXT: min.f32 %r11, %r10, %r9, %r1; +; CHECK-SM100-NEXT: min.f32 %r12, %r11, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; + %res = call nnan float @llvm.vector.reduce.fmin(<8 x float> %in) + ret float %res +} + +define float @reduce_fmin_float_nnan_nonpow2(<7 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmin_float_nnan_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmin_float_nnan_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_nnan_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_nnan_nonpow2_param_0]; +; CHECK-SM80-NEXT: min.f32 %r8, %r5, %r6; +; CHECK-SM80-NEXT: min.f32 %r9, %r8, %r7; +; CHECK-SM80-NEXT: min.f32 %r10, %r3, %r4; +; CHECK-SM80-NEXT: min.f32 %r11, %r1, %r2; +; CHECK-SM80-NEXT: min.f32 %r12, %r11, %r10; +; CHECK-SM80-NEXT: min.f32 %r13, %r12, %r9; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmin_float_nnan_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmin_float_nnan_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmin_float_nnan_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_nnan_nonpow2_param_0]; +; CHECK-SM100-NEXT: min.f32 %r8, %r4, %r5, %r6; +; CHECK-SM100-NEXT: min.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: min.f32 %r10, %r9, %r8, %r7; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; + %res = call nnan float @llvm.vector.reduce.fmin(<7 x float> %in) + ret float %res +} + define half @reduce_fmaximum_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fmaximum_half( ; CHECK: { @@ -785,84 +1069,121 @@ define half @reduce_fmaximum_half_reassoc_nonpow2(<7 x half> %in) { ret half %res } -; Check straight-line reduction. define float @reduce_fmaximum_float(<8 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmaximum_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0]; +; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: max.NaN.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.NaN.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; ; -; CHECK-LABEL: reduce_fmaximum_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: max.NaN.f32 %r5, %r4, %r2; -; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r7; -; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r5; -; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r1; -; CHECK-NEXT: max.NaN.f32 %r13, %r8, %r6; -; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; -; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM100-LABEL: reduce_fmaximum_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r8, %r5, %r6; +; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r3, %r4, %r7; +; CHECK-SM100-NEXT: max.NaN.f32 %r11, %r10, %r9, %r1; +; CHECK-SM100-NEXT: max.NaN.f32 %r12, %r11, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fmaximum(<8 x float> %in) ret float %res } define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0]; +; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: max.NaN.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.NaN.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; ; -; CHECK-LABEL: reduce_fmaximum_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_reassoc_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: max.NaN.f32 %r5, %r4, %r2; -; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r7; -; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r5; -; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r1; -; CHECK-NEXT: max.NaN.f32 %r13, %r8, %r6; -; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; -; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM100-LABEL: reduce_fmaximum_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_reassoc_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r8, %r5, %r6; +; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r3, %r4, %r7; +; CHECK-SM100-NEXT: max.NaN.f32 %r11, %r10, %r9, %r1; +; CHECK-SM100-NEXT: max.NaN.f32 %r12, %r11, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmaximum(<8 x float> %in) ret float %res } define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) { +; CHECK-SM80-LABEL: reduce_fmaximum_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: max.NaN.f32 %r8, %r5, %r6; +; CHECK-SM80-NEXT: max.NaN.f32 %r9, %r8, %r7; +; CHECK-SM80-NEXT: max.NaN.f32 %r10, %r3, %r4; +; CHECK-SM80-NEXT: max.NaN.f32 %r11, %r1, %r2; +; CHECK-SM80-NEXT: max.NaN.f32 %r12, %r11, %r10; +; CHECK-SM80-NEXT: max.NaN.f32 %r13, %r12, %r9; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; ; -; CHECK-LABEL: reduce_fmaximum_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: max.NaN.f32 %r8, %r3, %r7; -; CHECK-NEXT: max.NaN.f32 %r9, %r1, %r5; -; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r8; -; CHECK-NEXT: max.NaN.f32 %r11, %r2, %r6; -; CHECK-NEXT: max.NaN.f32 %r12, %r11, %r4; -; CHECK-NEXT: max.NaN.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM100-LABEL: reduce_fmaximum_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: max.NaN.f32 %r8, %r4, %r5, %r6; +; CHECK-SM100-NEXT: max.NaN.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: max.NaN.f32 %r10, %r9, %r8, %r7; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmaximum(<7 x float> %in) ret float %res } -; Check straight-line reduction. define half @reduce_fminimum_half(<8 x half> %in) { ; CHECK-LABEL: reduce_fminimum_half( ; CHECK: { @@ -927,79 +1248,117 @@ define half @reduce_fminimum_half_reassoc_nonpow2(<7 x half> %in) { ret half %res } -; Check straight-line reduction. define float @reduce_fminimum_float(<8 x float> %in) { +; CHECK-SM80-LABEL: reduce_fminimum_float( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0]; +; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: min.NaN.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.NaN.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; ; -; CHECK-LABEL: reduce_fminimum_float( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: min.NaN.f32 %r5, %r4, %r2; -; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r7; -; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r5; -; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r1; -; CHECK-NEXT: min.NaN.f32 %r13, %r8, %r6; -; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; -; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM100-LABEL: reduce_fminimum_float( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r8, %r5, %r6; +; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r3, %r4, %r7; +; CHECK-SM100-NEXT: min.NaN.f32 %r11, %r10, %r9, %r1; +; CHECK-SM100-NEXT: min.NaN.f32 %r12, %r11, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call float @llvm.vector.reduce.fminimum(<8 x float> %in) ret float %res } define float @reduce_fminimum_float_reassoc(<8 x float> %in) { +; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0]; +; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r5, %r6; +; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r3, %r4; +; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r1, %r2; +; CHECK-SM80-NEXT: min.NaN.f32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.NaN.f32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; ; -; CHECK-LABEL: reduce_fminimum_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_reassoc_param_0]; -; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; -; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; -; CHECK-NEXT: min.NaN.f32 %r5, %r4, %r2; -; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; -; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; -; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r7; -; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r5; -; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r1; -; CHECK-NEXT: min.NaN.f32 %r13, %r8, %r6; -; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; -; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM100-LABEL: reduce_fminimum_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_reassoc_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r8, %r5, %r6; +; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r3, %r4, %r7; +; CHECK-SM100-NEXT: min.NaN.f32 %r11, %r10, %r9, %r1; +; CHECK-SM100-NEXT: min.NaN.f32 %r12, %r11, %r2; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fminimum(<8 x float> %in) ret float %res } define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) { +; CHECK-SM80-LABEL: reduce_fminimum_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: min.NaN.f32 %r8, %r5, %r6; +; CHECK-SM80-NEXT: min.NaN.f32 %r9, %r8, %r7; +; CHECK-SM80-NEXT: min.NaN.f32 %r10, %r3, %r4; +; CHECK-SM80-NEXT: min.NaN.f32 %r11, %r1, %r2; +; CHECK-SM80-NEXT: min.NaN.f32 %r12, %r11, %r10; +; CHECK-SM80-NEXT: min.NaN.f32 %r13, %r12, %r9; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; ; -; CHECK-LABEL: reduce_fminimum_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: min.NaN.f32 %r8, %r3, %r7; -; CHECK-NEXT: min.NaN.f32 %r9, %r1, %r5; -; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r8; -; CHECK-NEXT: min.NaN.f32 %r11, %r2, %r6; -; CHECK-NEXT: min.NaN.f32 %r12, %r11, %r4; -; CHECK-NEXT: min.NaN.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM100-LABEL: reduce_fminimum_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: min.NaN.f32 %r8, %r4, %r5, %r6; +; CHECK-SM100-NEXT: min.NaN.f32 %r9, %r1, %r2, %r3; +; CHECK-SM100-NEXT: min.NaN.f32 %r10, %r9, %r8, %r7; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fminimum(<7 x float> %in) ret float %res } @@ -1014,15 +1373,15 @@ define i16 @reduce_add_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-SM80-NEXT: add.s16 %rs5, %rs3, %rs1; +; CHECK-SM80-NEXT: add.s16 %rs5, %rs4, %rs2; ; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-SM80-NEXT: add.s16 %rs10, %rs8, %rs6; -; CHECK-SM80-NEXT: add.s16 %rs11, %rs4, %rs2; -; CHECK-SM80-NEXT: add.s16 %rs12, %rs9, %rs7; -; CHECK-SM80-NEXT: add.s16 %rs13, %rs12, %rs11; -; CHECK-SM80-NEXT: add.s16 %rs14, %rs10, %rs5; -; CHECK-SM80-NEXT: add.s16 %rs15, %rs14, %rs13; +; CHECK-SM80-NEXT: add.s16 %rs10, %rs9, %rs7; +; CHECK-SM80-NEXT: add.s16 %rs11, %rs10, %rs5; +; CHECK-SM80-NEXT: add.s16 %rs12, %rs3, %rs1; +; CHECK-SM80-NEXT: add.s16 %rs13, %rs8, %rs6; +; CHECK-SM80-NEXT: add.s16 %rs14, %rs13, %rs12; +; CHECK-SM80-NEXT: add.s16 %rs15, %rs14, %rs11; ; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-SM80-NEXT: ret; @@ -1030,20 +1389,17 @@ define i16 @reduce_add_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_add_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; ; CHECK-SM100-NEXT: add.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: add.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: add.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: add.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.add(<8 x i16> %in) ret i16 %res @@ -1103,13 +1459,13 @@ define i32 @reduce_add_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0]; -; CHECK-NEXT: add.s32 %r9, %r3, %r7; -; CHECK-NEXT: add.s32 %r10, %r1, %r5; -; CHECK-NEXT: add.s32 %r11, %r4, %r8; -; CHECK-NEXT: add.s32 %r12, %r2, %r6; -; CHECK-NEXT: add.s32 %r13, %r12, %r11; -; CHECK-NEXT: add.s32 %r14, %r10, %r9; -; CHECK-NEXT: add.s32 %r15, %r14, %r13; +; CHECK-NEXT: add.s32 %r9, %r4, %r8; +; CHECK-NEXT: add.s32 %r10, %r2, %r6; +; CHECK-NEXT: add.s32 %r11, %r10, %r9; +; CHECK-NEXT: add.s32 %r12, %r3, %r7; +; CHECK-NEXT: add.s32 %r13, %r1, %r5; +; CHECK-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-NEXT: add.s32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.add(<8 x i32> %in) @@ -1147,15 +1503,15 @@ define i16 @reduce_mul_i16(<8 x i16> %in) { ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i16_param_0]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-NEXT: mul.lo.s16 %rs5, %rs3, %rs1; +; CHECK-NEXT: mul.lo.s16 %rs5, %rs4, %rs2; ; CHECK-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-NEXT: mul.lo.s16 %rs10, %rs8, %rs6; -; CHECK-NEXT: mul.lo.s16 %rs11, %rs4, %rs2; -; CHECK-NEXT: mul.lo.s16 %rs12, %rs9, %rs7; -; CHECK-NEXT: mul.lo.s16 %rs13, %rs12, %rs11; -; CHECK-NEXT: mul.lo.s16 %rs14, %rs10, %rs5; -; CHECK-NEXT: mul.lo.s16 %rs15, %rs14, %rs13; +; CHECK-NEXT: mul.lo.s16 %rs10, %rs9, %rs7; +; CHECK-NEXT: mul.lo.s16 %rs11, %rs10, %rs5; +; CHECK-NEXT: mul.lo.s16 %rs12, %rs3, %rs1; +; CHECK-NEXT: mul.lo.s16 %rs13, %rs8, %rs6; +; CHECK-NEXT: mul.lo.s16 %rs14, %rs13, %rs12; +; CHECK-NEXT: mul.lo.s16 %rs15, %rs14, %rs11; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; @@ -1194,13 +1550,13 @@ define i32 @reduce_mul_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0]; -; CHECK-NEXT: mul.lo.s32 %r9, %r3, %r7; -; CHECK-NEXT: mul.lo.s32 %r10, %r1, %r5; -; CHECK-NEXT: mul.lo.s32 %r11, %r4, %r8; -; CHECK-NEXT: mul.lo.s32 %r12, %r2, %r6; -; CHECK-NEXT: mul.lo.s32 %r13, %r12, %r11; -; CHECK-NEXT: mul.lo.s32 %r14, %r10, %r9; -; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r13; +; CHECK-NEXT: mul.lo.s32 %r9, %r4, %r8; +; CHECK-NEXT: mul.lo.s32 %r10, %r2, %r6; +; CHECK-NEXT: mul.lo.s32 %r11, %r10, %r9; +; CHECK-NEXT: mul.lo.s32 %r12, %r3, %r7; +; CHECK-NEXT: mul.lo.s32 %r13, %r1, %r5; +; CHECK-NEXT: mul.lo.s32 %r14, %r13, %r12; +; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.mul(<8 x i32> %in) @@ -1238,15 +1594,15 @@ define i16 @reduce_umax_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-SM80-NEXT: max.u16 %rs5, %rs3, %rs1; +; CHECK-SM80-NEXT: max.u16 %rs5, %rs4, %rs2; ; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-SM80-NEXT: max.u16 %rs10, %rs8, %rs6; -; CHECK-SM80-NEXT: max.u16 %rs11, %rs4, %rs2; -; CHECK-SM80-NEXT: max.u16 %rs12, %rs9, %rs7; -; CHECK-SM80-NEXT: max.u16 %rs13, %rs12, %rs11; -; CHECK-SM80-NEXT: max.u16 %rs14, %rs10, %rs5; -; CHECK-SM80-NEXT: max.u16 %rs15, %rs14, %rs13; +; CHECK-SM80-NEXT: max.u16 %rs10, %rs9, %rs7; +; CHECK-SM80-NEXT: max.u16 %rs11, %rs10, %rs5; +; CHECK-SM80-NEXT: max.u16 %rs12, %rs3, %rs1; +; CHECK-SM80-NEXT: max.u16 %rs13, %rs8, %rs6; +; CHECK-SM80-NEXT: max.u16 %rs14, %rs13, %rs12; +; CHECK-SM80-NEXT: max.u16 %rs15, %rs14, %rs11; ; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-SM80-NEXT: ret; @@ -1254,20 +1610,17 @@ define i16 @reduce_umax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; ; CHECK-SM100-NEXT: max.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: max.u16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: max.u16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umax(<8 x i16> %in) ret i16 %res @@ -1327,13 +1680,13 @@ define i32 @reduce_umax_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0]; -; CHECK-NEXT: max.u32 %r9, %r3, %r7; -; CHECK-NEXT: max.u32 %r10, %r1, %r5; -; CHECK-NEXT: max.u32 %r11, %r4, %r8; -; CHECK-NEXT: max.u32 %r12, %r2, %r6; -; CHECK-NEXT: max.u32 %r13, %r12, %r11; -; CHECK-NEXT: max.u32 %r14, %r10, %r9; -; CHECK-NEXT: max.u32 %r15, %r14, %r13; +; CHECK-NEXT: max.u32 %r9, %r4, %r8; +; CHECK-NEXT: max.u32 %r10, %r2, %r6; +; CHECK-NEXT: max.u32 %r11, %r10, %r9; +; CHECK-NEXT: max.u32 %r12, %r3, %r7; +; CHECK-NEXT: max.u32 %r13, %r1, %r5; +; CHECK-NEXT: max.u32 %r14, %r13, %r12; +; CHECK-NEXT: max.u32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.umax(<8 x i32> %in) @@ -1371,15 +1724,15 @@ define i16 @reduce_umin_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-SM80-NEXT: min.u16 %rs5, %rs3, %rs1; +; CHECK-SM80-NEXT: min.u16 %rs5, %rs4, %rs2; ; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-SM80-NEXT: min.u16 %rs10, %rs8, %rs6; -; CHECK-SM80-NEXT: min.u16 %rs11, %rs4, %rs2; -; CHECK-SM80-NEXT: min.u16 %rs12, %rs9, %rs7; -; CHECK-SM80-NEXT: min.u16 %rs13, %rs12, %rs11; -; CHECK-SM80-NEXT: min.u16 %rs14, %rs10, %rs5; -; CHECK-SM80-NEXT: min.u16 %rs15, %rs14, %rs13; +; CHECK-SM80-NEXT: min.u16 %rs10, %rs9, %rs7; +; CHECK-SM80-NEXT: min.u16 %rs11, %rs10, %rs5; +; CHECK-SM80-NEXT: min.u16 %rs12, %rs3, %rs1; +; CHECK-SM80-NEXT: min.u16 %rs13, %rs8, %rs6; +; CHECK-SM80-NEXT: min.u16 %rs14, %rs13, %rs12; +; CHECK-SM80-NEXT: min.u16 %rs15, %rs14, %rs11; ; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-SM80-NEXT: ret; @@ -1387,20 +1740,17 @@ define i16 @reduce_umin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; ; CHECK-SM100-NEXT: min.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: min.u16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: min.u16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umin(<8 x i16> %in) ret i16 %res @@ -1460,13 +1810,13 @@ define i32 @reduce_umin_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0]; -; CHECK-NEXT: min.u32 %r9, %r3, %r7; -; CHECK-NEXT: min.u32 %r10, %r1, %r5; -; CHECK-NEXT: min.u32 %r11, %r4, %r8; -; CHECK-NEXT: min.u32 %r12, %r2, %r6; -; CHECK-NEXT: min.u32 %r13, %r12, %r11; -; CHECK-NEXT: min.u32 %r14, %r10, %r9; -; CHECK-NEXT: min.u32 %r15, %r14, %r13; +; CHECK-NEXT: min.u32 %r9, %r4, %r8; +; CHECK-NEXT: min.u32 %r10, %r2, %r6; +; CHECK-NEXT: min.u32 %r11, %r10, %r9; +; CHECK-NEXT: min.u32 %r12, %r3, %r7; +; CHECK-NEXT: min.u32 %r13, %r1, %r5; +; CHECK-NEXT: min.u32 %r14, %r13, %r12; +; CHECK-NEXT: min.u32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.umin(<8 x i32> %in) @@ -1504,15 +1854,15 @@ define i16 @reduce_smax_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-SM80-NEXT: max.s16 %rs5, %rs3, %rs1; +; CHECK-SM80-NEXT: max.s16 %rs5, %rs4, %rs2; ; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-SM80-NEXT: max.s16 %rs10, %rs8, %rs6; -; CHECK-SM80-NEXT: max.s16 %rs11, %rs4, %rs2; -; CHECK-SM80-NEXT: max.s16 %rs12, %rs9, %rs7; -; CHECK-SM80-NEXT: max.s16 %rs13, %rs12, %rs11; -; CHECK-SM80-NEXT: max.s16 %rs14, %rs10, %rs5; -; CHECK-SM80-NEXT: max.s16 %rs15, %rs14, %rs13; +; CHECK-SM80-NEXT: max.s16 %rs10, %rs9, %rs7; +; CHECK-SM80-NEXT: max.s16 %rs11, %rs10, %rs5; +; CHECK-SM80-NEXT: max.s16 %rs12, %rs3, %rs1; +; CHECK-SM80-NEXT: max.s16 %rs13, %rs8, %rs6; +; CHECK-SM80-NEXT: max.s16 %rs14, %rs13, %rs12; +; CHECK-SM80-NEXT: max.s16 %rs15, %rs14, %rs11; ; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-SM80-NEXT: ret; @@ -1520,20 +1870,17 @@ define i16 @reduce_smax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; ; CHECK-SM100-NEXT: max.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: max.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: max.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smax(<8 x i16> %in) ret i16 %res @@ -1593,13 +1940,13 @@ define i32 @reduce_smax_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0]; -; CHECK-NEXT: max.s32 %r9, %r3, %r7; -; CHECK-NEXT: max.s32 %r10, %r1, %r5; -; CHECK-NEXT: max.s32 %r11, %r4, %r8; -; CHECK-NEXT: max.s32 %r12, %r2, %r6; -; CHECK-NEXT: max.s32 %r13, %r12, %r11; -; CHECK-NEXT: max.s32 %r14, %r10, %r9; -; CHECK-NEXT: max.s32 %r15, %r14, %r13; +; CHECK-NEXT: max.s32 %r9, %r4, %r8; +; CHECK-NEXT: max.s32 %r10, %r2, %r6; +; CHECK-NEXT: max.s32 %r11, %r10, %r9; +; CHECK-NEXT: max.s32 %r12, %r3, %r7; +; CHECK-NEXT: max.s32 %r13, %r1, %r5; +; CHECK-NEXT: max.s32 %r14, %r13, %r12; +; CHECK-NEXT: max.s32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.smax(<8 x i32> %in) @@ -1637,15 +1984,15 @@ define i16 @reduce_smin_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-SM80-NEXT: min.s16 %rs5, %rs3, %rs1; +; CHECK-SM80-NEXT: min.s16 %rs5, %rs4, %rs2; ; CHECK-SM80-NEXT: mov.b32 {%rs6, %rs7}, %r3; ; CHECK-SM80-NEXT: mov.b32 {%rs8, %rs9}, %r1; -; CHECK-SM80-NEXT: min.s16 %rs10, %rs8, %rs6; -; CHECK-SM80-NEXT: min.s16 %rs11, %rs4, %rs2; -; CHECK-SM80-NEXT: min.s16 %rs12, %rs9, %rs7; -; CHECK-SM80-NEXT: min.s16 %rs13, %rs12, %rs11; -; CHECK-SM80-NEXT: min.s16 %rs14, %rs10, %rs5; -; CHECK-SM80-NEXT: min.s16 %rs15, %rs14, %rs13; +; CHECK-SM80-NEXT: min.s16 %rs10, %rs9, %rs7; +; CHECK-SM80-NEXT: min.s16 %rs11, %rs10, %rs5; +; CHECK-SM80-NEXT: min.s16 %rs12, %rs3, %rs1; +; CHECK-SM80-NEXT: min.s16 %rs13, %rs8, %rs6; +; CHECK-SM80-NEXT: min.s16 %rs14, %rs13, %rs12; +; CHECK-SM80-NEXT: min.s16 %rs15, %rs14, %rs11; ; CHECK-SM80-NEXT: cvt.u32.u16 %r5, %rs15; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-SM80-NEXT: ret; @@ -1653,20 +2000,17 @@ define i16 @reduce_smin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; ; CHECK-SM100-NEXT: min.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: min.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: min.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in) ret i16 %res @@ -1726,13 +2070,13 @@ define i32 @reduce_smin_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0]; -; CHECK-NEXT: min.s32 %r9, %r3, %r7; -; CHECK-NEXT: min.s32 %r10, %r1, %r5; -; CHECK-NEXT: min.s32 %r11, %r4, %r8; -; CHECK-NEXT: min.s32 %r12, %r2, %r6; -; CHECK-NEXT: min.s32 %r13, %r12, %r11; -; CHECK-NEXT: min.s32 %r14, %r10, %r9; -; CHECK-NEXT: min.s32 %r15, %r14, %r13; +; CHECK-NEXT: min.s32 %r9, %r4, %r8; +; CHECK-NEXT: min.s32 %r10, %r2, %r6; +; CHECK-NEXT: min.s32 %r11, %r10, %r9; +; CHECK-NEXT: min.s32 %r12, %r3, %r7; +; CHECK-NEXT: min.s32 %r13, %r1, %r5; +; CHECK-NEXT: min.s32 %r14, %r13, %r12; +; CHECK-NEXT: min.s32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.smin(<8 x i32> %in) @@ -1761,43 +2105,21 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_and_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_and_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; -; CHECK-SM80-NEXT: and.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: and.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: and.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_and_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; -; CHECK-SM100-NEXT: and.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: and.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: and.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_and_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; +; CHECK-NEXT: and.b32 %r5, %r2, %r4; +; CHECK-NEXT: and.b32 %r6, %r1, %r3; +; CHECK-NEXT: and.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.and(<8 x i16> %in) ret i16 %res } @@ -1837,13 +2159,13 @@ define i32 @reduce_and_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0]; -; CHECK-NEXT: and.b32 %r9, %r3, %r7; -; CHECK-NEXT: and.b32 %r10, %r1, %r5; -; CHECK-NEXT: and.b32 %r11, %r4, %r8; -; CHECK-NEXT: and.b32 %r12, %r2, %r6; -; CHECK-NEXT: and.b32 %r13, %r12, %r11; -; CHECK-NEXT: and.b32 %r14, %r10, %r9; -; CHECK-NEXT: and.b32 %r15, %r14, %r13; +; CHECK-NEXT: and.b32 %r9, %r4, %r8; +; CHECK-NEXT: and.b32 %r10, %r2, %r6; +; CHECK-NEXT: and.b32 %r11, %r10, %r9; +; CHECK-NEXT: and.b32 %r12, %r3, %r7; +; CHECK-NEXT: and.b32 %r13, %r1, %r5; +; CHECK-NEXT: and.b32 %r14, %r13, %r12; +; CHECK-NEXT: and.b32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.and(<8 x i32> %in) @@ -1872,43 +2194,21 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_or_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_or_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; -; CHECK-SM80-NEXT: or.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: or.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: or.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_or_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; -; CHECK-SM100-NEXT: or.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: or.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: or.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_or_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; +; CHECK-NEXT: or.b32 %r5, %r2, %r4; +; CHECK-NEXT: or.b32 %r6, %r1, %r3; +; CHECK-NEXT: or.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.or(<8 x i16> %in) ret i16 %res } @@ -1948,13 +2248,13 @@ define i32 @reduce_or_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0]; -; CHECK-NEXT: or.b32 %r9, %r3, %r7; -; CHECK-NEXT: or.b32 %r10, %r1, %r5; -; CHECK-NEXT: or.b32 %r11, %r4, %r8; -; CHECK-NEXT: or.b32 %r12, %r2, %r6; -; CHECK-NEXT: or.b32 %r13, %r12, %r11; -; CHECK-NEXT: or.b32 %r14, %r10, %r9; -; CHECK-NEXT: or.b32 %r15, %r14, %r13; +; CHECK-NEXT: or.b32 %r9, %r4, %r8; +; CHECK-NEXT: or.b32 %r10, %r2, %r6; +; CHECK-NEXT: or.b32 %r11, %r10, %r9; +; CHECK-NEXT: or.b32 %r12, %r3, %r7; +; CHECK-NEXT: or.b32 %r13, %r1, %r5; +; CHECK-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-NEXT: or.b32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.or(<8 x i32> %in) @@ -1983,43 +2283,21 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_xor_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_xor_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; -; CHECK-SM80-NEXT: xor.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: xor.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: xor.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_xor_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; -; CHECK-SM100-NEXT: xor.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: xor.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: xor.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_xor_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; +; CHECK-NEXT: xor.b32 %r5, %r2, %r4; +; CHECK-NEXT: xor.b32 %r6, %r1, %r3; +; CHECK-NEXT: xor.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.xor(<8 x i16> %in) ret i16 %res } @@ -2059,13 +2337,13 @@ define i32 @reduce_xor_i32(<8 x i32> %in) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0]; -; CHECK-NEXT: xor.b32 %r9, %r3, %r7; -; CHECK-NEXT: xor.b32 %r10, %r1, %r5; -; CHECK-NEXT: xor.b32 %r11, %r4, %r8; -; CHECK-NEXT: xor.b32 %r12, %r2, %r6; -; CHECK-NEXT: xor.b32 %r13, %r12, %r11; -; CHECK-NEXT: xor.b32 %r14, %r10, %r9; -; CHECK-NEXT: xor.b32 %r15, %r14, %r13; +; CHECK-NEXT: xor.b32 %r9, %r4, %r8; +; CHECK-NEXT: xor.b32 %r10, %r2, %r6; +; CHECK-NEXT: xor.b32 %r11, %r10, %r9; +; CHECK-NEXT: xor.b32 %r12, %r3, %r7; +; CHECK-NEXT: xor.b32 %r13, %r1, %r5; +; CHECK-NEXT: xor.b32 %r14, %r13, %r12; +; CHECK-NEXT: xor.b32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call i32 @llvm.vector.reduce.xor(<8 x i32> %in) |