; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
target triple = "nvptx64-nvidia-cuda"

; This file exercises NVPTX code generation for chains of insertelement
; instructions in which at least one insertion uses a runtime (non-constant)
; index. The assertion lines inside each function are generated output; they
; should be regenerated with the update script rather than edited by hand.

; Test dynamic insertelt at the beginning of a chain
; IR: 10 is inserted at %idx, then 20 and 30 at constant lanes 1 and 2.
; Expected PTX: the value 10 is stored into a 16-byte local depot at byte
; offset (idx & 3) << 2; lanes 0 and 3 are reloaded from the depot while lanes
; 1 and 2 appear as the immediates 20 and 30 in the returned vector.
define <4 x i32> @dynamic_at_beginning(i32 %idx) {
; CHECK-LABEL: dynamic_at_beginning(
; CHECK: {
; CHECK-NEXT: .local .align 4 .b8 __local_depot0[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot0;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_at_beginning_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
; CHECK-NEXT: st.b32 [%rd5], 10;
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
; CHECK-NEXT: ld.b32 %r2, [%SP];
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r2, 20, 30, %r1};
; CHECK-NEXT: ret;
  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx
  %v1 = insertelement <4 x i32> %v0, i32 20, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 30, i32 2
  ret <4 x i32> %v2
}

; Test dynamic insertelt at the end of a chain
; IR: 10 and 20 are inserted at constant lanes 0 and 1, then 30 at %idx.
; Expected PTX: the two constant lanes are stored to the depot first (offsets
; 4 and 0), the dynamic store of 30 follows, and all four lanes are reloaded
; from the depot for the return value.
define <4 x i32> @dynamic_at_end(i32 %idx) {
; CHECK-LABEL: dynamic_at_end(
; CHECK: {
; CHECK-NEXT: .local .align 4 .b8 __local_depot1[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot1;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_at_end_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
; CHECK-NEXT: st.b32 [%SP+4], 20;
; CHECK-NEXT: st.b32 [%SP], 10;
; CHECK-NEXT: st.b32 [%rd5], 30;
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
; CHECK-NEXT: ld.b32 %r2, [%SP+8];
; CHECK-NEXT: ld.b32 %r3, [%SP+4];
; CHECK-NEXT: ld.b32 %r4, [%SP];
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
; CHECK-NEXT: ret;
  %v0 = insertelement <4 x i32> poison, i32 10, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 20, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx
  ret <4 x i32> %v2
}

; Test dynamic insertelt in the middle of a chain
; IR: 10 at constant lane 0, 20 at %idx, 30 at constant lane 2.
; Expected PTX: 10 is stored at depot offset 0 before the dynamic store of 20;
; lane 2 is emitted as the immediate 30 and lanes 0, 1 and 3 are reloaded.
define <4 x i32> @dynamic_in_middle(i32 %idx) {
; CHECK-LABEL: dynamic_in_middle(
; CHECK: {
; CHECK-NEXT: .local .align 4 .b8 __local_depot2[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot2;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_in_middle_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
; CHECK-NEXT: st.b32 [%SP], 10;
; CHECK-NEXT: st.b32 [%rd5], 20;
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
; CHECK-NEXT: ld.b32 %r2, [%SP+4];
; CHECK-NEXT: ld.b32 %r3, [%SP];
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r3, %r2, 30, %r1};
; CHECK-NEXT: ret;
  %v0 = insertelement <4 x i32> poison, i32 10, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx
  %v2 = insertelement <4 x i32> %v1, i32 30, i32 2
  ret <4 x i32> %v2
}

; Test repeated dynamic insertelt with the same index
; IR: 10 and then 20 are both inserted at the same runtime index %idx.
; Expected PTX: only a single dynamic store (of 20) is present; no store of
; the overwritten value 10 appears in the expected output.
define <4 x i32> @repeated_same_index(i32 %idx) {
; CHECK-LABEL: repeated_same_index(
; CHECK: {
; CHECK-NEXT: .local .align 4 .b8 __local_depot3[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot3;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [repeated_same_index_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
; CHECK-NEXT: st.b32 [%rd5], 20;
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
; CHECK-NEXT: ld.b32 %r2, [%SP+8];
; CHECK-NEXT: ld.b32 %r3, [%SP+4];
; CHECK-NEXT: ld.b32 %r4, [%SP];
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
; CHECK-NEXT: ret;
  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx
  %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx
  ret <4 x i32> %v1
}

; Test multiple dynamic insertelts
; IR: 10 is inserted at %idx0, then 20 at %idx1.
; Expected PTX: two dynamic stores into the same 16-byte depot, in program
; order (10 first, then 20), followed by a reload of all four lanes.
define <4 x i32> @multiple_dynamic(i32 %idx0, i32 %idx1) {
; CHECK-LABEL: multiple_dynamic(
; CHECK: {
; CHECK-NEXT: .local .align 4 .b8 __local_depot4[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot4;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [multiple_dynamic_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
; CHECK-NEXT: st.b32 [%rd5], 10;
; CHECK-NEXT: ld.param.b32 %rd6, [multiple_dynamic_param_1];
; CHECK-NEXT: and.b64 %rd7, %rd6, 3;
; CHECK-NEXT: shl.b64 %rd8, %rd7, 2;
; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8;
; CHECK-NEXT: st.b32 [%rd9], 20;
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
; CHECK-NEXT: ld.b32 %r2, [%SP+8];
; CHECK-NEXT: ld.b32 %r3, [%SP+4];
; CHECK-NEXT: ld.b32 %r4, [%SP];
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
; CHECK-NEXT: ret;
  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
  %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx1
  ret <4 x i32> %v1
}

; Test chain with all dynamic insertelts
; IR: four insertions (10, 20, 30, 40), each at its own runtime index.
; Expected PTX: four dynamic stores into one 16-byte depot, kept in program
; order, followed by a reload of all four lanes.
define <4 x i32> @all_dynamic(i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) {
; CHECK-LABEL: all_dynamic(
; CHECK: {
; CHECK-NEXT: .local .align 4 .b8 __local_depot5[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot5;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [all_dynamic_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
; CHECK-NEXT: ld.param.b32 %rd6, [all_dynamic_param_1];
; CHECK-NEXT: and.b64 %rd7, %rd6, 3;
; CHECK-NEXT: shl.b64 %rd8, %rd7, 2;
; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8;
; CHECK-NEXT: ld.param.b32 %rd10, [all_dynamic_param_2];
; CHECK-NEXT: and.b64 %rd11, %rd10, 3;
; CHECK-NEXT: shl.b64 %rd12, %rd11, 2;
; CHECK-NEXT: add.s64 %rd13, %rd4, %rd12;
; CHECK-NEXT: st.b32 [%rd5], 10;
; CHECK-NEXT: st.b32 [%rd9], 20;
; CHECK-NEXT: st.b32 [%rd13], 30;
; CHECK-NEXT: ld.param.b32 %rd14, [all_dynamic_param_3];
; CHECK-NEXT: and.b64 %rd15, %rd14, 3;
; CHECK-NEXT: shl.b64 %rd16, %rd15, 2;
; CHECK-NEXT: add.s64 %rd17, %rd4, %rd16;
; CHECK-NEXT: st.b32 [%rd17], 40;
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
; CHECK-NEXT: ld.b32 %r2, [%SP+8];
; CHECK-NEXT: ld.b32 %r3, [%SP+4];
; CHECK-NEXT: ld.b32 %r4, [%SP];
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
; CHECK-NEXT: ret;
  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
  %v1 = insertelement <4 x i32> %v0, i32 20, i32 %idx1
  %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx2
  %v3 = insertelement <4 x i32> %v2, i32 40, i32 %idx3
  ret <4 x i32> %v3
}

; Test mixed constant and dynamic insertelts with high ratio of dynamic ones.
; Should lower all insertelts to stores.
; IR: 10 at %idx0, 20 at constant lane 1, 30 at %idx1.
; Expected PTX: 10 and 30 are stored through the two masked runtime indices
; and 20 is stored at the fixed depot offset 4; all four lanes are reloaded.
define <4 x i32> @mix_dynamic_constant(i32 %idx0, i32 %idx1) {
; CHECK-LABEL: mix_dynamic_constant(
; CHECK: {
; CHECK-NEXT: .local .align 4 .b8 __local_depot6[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot6;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [mix_dynamic_constant_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
; CHECK-NEXT: st.b32 [%rd5], 10;
; CHECK-NEXT: ld.param.b32 %rd6, [mix_dynamic_constant_param_1];
; CHECK-NEXT: and.b64 %rd7, %rd6, 3;
; CHECK-NEXT: shl.b64 %rd8, %rd7, 2;
; CHECK-NEXT: add.s64 %rd9, %rd4, %rd8;
; CHECK-NEXT: st.b32 [%SP+4], 20;
; CHECK-NEXT: st.b32 [%rd9], 30;
; CHECK-NEXT: ld.b32 %r1, [%SP+12];
; CHECK-NEXT: ld.b32 %r2, [%SP+8];
; CHECK-NEXT: ld.b32 %r3, [%SP+4];
; CHECK-NEXT: ld.b32 %r4, [%SP];
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
; CHECK-NEXT: ret;
  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
  %v1 = insertelement <4 x i32> %v0, i32 20, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 30, i32 %idx1
  ret <4 x i32> %v2
}

; Test two separate chains that don't interfere
; IR: two independent chains, each starting from poison with one dynamic
; insertion followed by one constant-lane insertion; both results are stored
; through the pointer arguments.
; Expected PTX: a 32-byte local depot in which the first chain is addressed
; from base offset 16 and the second from base offset 0. The constant-lane
; values 20 and 40 appear as immediates in the final vector stores.
define void @two_separate_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) {
; CHECK-LABEL: two_separate_chains(
; CHECK: {
; CHECK-NEXT: .local .align 4 .b8 __local_depot7[32];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-NEXT: .reg .b64 %rd<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot7;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [two_separate_chains_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
; CHECK-NEXT: add.u64 %rd4, %SP, 16;
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
; CHECK-NEXT: st.b32 [%rd5], 10;
; CHECK-NEXT: ld.param.b32 %rd6, [two_separate_chains_param_1];
; CHECK-NEXT: and.b64 %rd7, %rd6, 3;
; CHECK-NEXT: shl.b64 %rd8, %rd7, 2;
; CHECK-NEXT: add.u64 %rd9, %SP, 0;
; CHECK-NEXT: add.s64 %rd10, %rd9, %rd8;
; CHECK-NEXT: ld.b32 %r1, [%SP+28];
; CHECK-NEXT: ld.b32 %r2, [%SP+24];
; CHECK-NEXT: ld.b32 %r3, [%SP+16];
; CHECK-NEXT: ld.param.b64 %rd11, [two_separate_chains_param_2];
; CHECK-NEXT: st.b32 [%rd10], 30;
; CHECK-NEXT: ld.param.b64 %rd12, [two_separate_chains_param_3];
; CHECK-NEXT: ld.b32 %r4, [%SP+12];
; CHECK-NEXT: ld.b32 %r5, [%SP+4];
; CHECK-NEXT: ld.b32 %r6, [%SP];
; CHECK-NEXT: st.v4.b32 [%rd11], {%r3, 20, %r2, %r1};
; CHECK-NEXT: st.v4.b32 [%rd12], {%r6, %r5, 40, %r4};
; CHECK-NEXT: ret;
  ; Chain 1
  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
  %v1 = insertelement <4 x i32> %v0, i32 20, i32 1

  ; Chain 2
  %w0 = insertelement <4 x i32> poison, i32 30, i32 %idx1
  %w1 = insertelement <4 x i32> %w0, i32 40, i32 2

  store <4 x i32> %v1, ptr %out0
  store <4 x i32> %w1, ptr %out1
  ret void
}

; Test overlapping chains (chain 2 starts from middle of chain 1)
; IR: %v1 has two users: a dynamic insertion (%w0) and a constant-lane
; insertion (%v2); both results are stored through the pointer arguments.
; Expected PTX: a 32-byte local depot. The store of 10 at the idx0 offset is
; emitted twice, once relative to base offset 16 and once relative to base
; offset 0. The vector written to %out0 carries 20 and 40 as immediates, while
; the vector written to %out1 is reloaded in full from base offset 0.
define void @overlapping_chains(i32 %idx0, i32 %idx1, ptr %out0, ptr %out1) {
; CHECK-LABEL: overlapping_chains(
; CHECK: {
; CHECK-NEXT: .local .align 4 .b8 __local_depot8[32];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-NEXT: .reg .b64 %rd<14>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot8;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [overlapping_chains_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 3;
; CHECK-NEXT: shl.b64 %rd3, %rd2, 2;
; CHECK-NEXT: add.u64 %rd4, %SP, 16;
; CHECK-NEXT: add.s64 %rd5, %rd4, %rd3;
; CHECK-NEXT: st.b32 [%rd5], 10;
; CHECK-NEXT: add.u64 %rd6, %SP, 0;
; CHECK-NEXT: add.s64 %rd7, %rd6, %rd3;
; CHECK-NEXT: ld.b32 %r1, [%SP+28];
; CHECK-NEXT: ld.b32 %r2, [%SP+16];
; CHECK-NEXT: ld.param.b64 %rd8, [overlapping_chains_param_2];
; CHECK-NEXT: st.b32 [%rd7], 10;
; CHECK-NEXT: ld.param.b32 %rd9, [overlapping_chains_param_1];
; CHECK-NEXT: and.b64 %rd10, %rd9, 3;
; CHECK-NEXT: shl.b64 %rd11, %rd10, 2;
; CHECK-NEXT: add.s64 %rd12, %rd6, %rd11;
; CHECK-NEXT: st.b32 [%SP+4], 20;
; CHECK-NEXT: st.b32 [%rd12], 30;
; CHECK-NEXT: ld.param.b64 %rd13, [overlapping_chains_param_3];
; CHECK-NEXT: ld.b32 %r3, [%SP+12];
; CHECK-NEXT: ld.b32 %r4, [%SP+8];
; CHECK-NEXT: ld.b32 %r5, [%SP+4];
; CHECK-NEXT: ld.b32 %r6, [%SP];
; CHECK-NEXT: st.v4.b32 [%rd8], {%r2, 20, 40, %r1};
; CHECK-NEXT: st.v4.b32 [%rd13], {%r6, %r5, %r4, %r3};
; CHECK-NEXT: ret;
  %v0 = insertelement <4 x i32> poison, i32 10, i32 %idx0
  %v1 = insertelement <4 x i32> %v0, i32 20, i32 1

  ; Chain 2 starts from v1
  %w0 = insertelement <4 x i32> %v1, i32 30, i32 %idx1

  ; Continue chain 1
  %v2 = insertelement <4 x i32> %v1, i32 40, i32 2

  store <4 x i32> %v2, ptr %out0
  store <4 x i32> %w0, ptr %out1
  ret void
}

; Test with i1 elements (1-bit, non-byte-aligned)
; IR: i1 1 at %idx, then i1 0 and i1 1 at constant lanes 1 and 2.
; Expected PTX: the index is masked with 7 and the element is written as one
; byte (st.b8) into an 8-byte depot; the result is returned with one byte
; store per lane, lanes 1 and 2 being the immediates 0 and 1.
define <8 x i1> @dynamic_i1(i32 %idx) {
; CHECK-LABEL: dynamic_i1(
; CHECK: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot9[8];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot9;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i1_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 7;
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2};
; CHECK-NEXT: st.b8 [%rd4], 1;
; CHECK-NEXT: ld.b32 %r3, [%SP];
; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x7773U;
; CHECK-NEXT: ld.b32 %r5, [%SP+4];
; CHECK-NEXT: prmt.b32 %r6, %r5, 0, 0x7771U;
; CHECK-NEXT: prmt.b32 %r7, %r5, 0, 0x7772U;
; CHECK-NEXT: prmt.b32 %r8, %r5, 0, 0x7773U;
; CHECK-NEXT: st.param.b8 [func_retval0+4], %r5;
; CHECK-NEXT: st.param.b8 [func_retval0], %r3;
; CHECK-NEXT: st.param.b8 [func_retval0+7], %r8;
; CHECK-NEXT: st.param.b8 [func_retval0+6], %r7;
; CHECK-NEXT: st.param.b8 [func_retval0+5], %r6;
; CHECK-NEXT: st.param.b8 [func_retval0+3], %r4;
; CHECK-NEXT: st.param.b8 [func_retval0+2], 1;
; CHECK-NEXT: st.param.b8 [func_retval0+1], 0;
; CHECK-NEXT: ret;
  %v0 = insertelement <8 x i1> poison, i1 1, i32 %idx
  %v1 = insertelement <8 x i1> %v0, i1 0, i32 1
  %v2 = insertelement <8 x i1> %v1, i1 1, i32 2
  ret <8 x i1> %v2
}

; Test with i2 elements (2-bit, non-byte-aligned)
; IR: i2 1 at %idx, then i2 2 and i2 3 at constant lanes 1 and 2.
; Expected PTX: the index is masked with 7 and the element is written as one
; byte into a 16-byte depot; the result is returned with one 16-bit store per
; lane, lanes 1 and 2 being the immediates 2 and 3.
define <8 x i2> @dynamic_i2(i32 %idx) {
; CHECK-LABEL: dynamic_i2(
; CHECK: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot10[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b16 %rs<24>;
; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot10;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i2_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 7;
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2};
; CHECK-NEXT: st.b8 [%rd4], 1;
; CHECK-NEXT: ld.b32 %r3, [%SP+4];
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: and.b16 %rs2, %rs1, 3;
; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: and.b16 %rs4, %rs3, 3;
; CHECK-NEXT: shl.b16 %rs5, %rs4, 2;
; CHECK-NEXT: or.b16 %rs6, %rs2, %rs5;
; CHECK-NEXT: prmt.b32 %r5, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r5;
; CHECK-NEXT: and.b16 %rs8, %rs7, 3;
; CHECK-NEXT: shl.b16 %rs9, %rs8, 4;
; CHECK-NEXT: or.b16 %rs10, %rs6, %rs9;
; CHECK-NEXT: prmt.b32 %r6, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r6;
; CHECK-NEXT: shl.b16 %rs12, %rs11, 6;
; CHECK-NEXT: or.b16 %rs13, %rs10, %rs12;
; CHECK-NEXT: st.b8 [%SP+8], %rs13;
; CHECK-NEXT: ld.b32 %r7, [%SP];
; CHECK-NEXT: prmt.b32 %r8, %r7, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs14, %r8;
; CHECK-NEXT: shl.b16 %rs15, %rs14, 6;
; CHECK-NEXT: and.b16 %rs16, %rs15, 192;
; CHECK-NEXT: ld.s8 %rs17, [%SP+8];
; CHECK-NEXT: shl.b16 %rs18, %rs17, 8;
; CHECK-NEXT: or.b16 %rs19, %rs16, %rs18;
; CHECK-NEXT: prmt.b32 %r9, %r7, 0, 0x7770U;
; CHECK-NEXT: st.param.b16 [func_retval0], %r9;
; CHECK-NEXT: st.param.b16 [func_retval0+8], %rs17;
; CHECK-NEXT: shr.s16 %rs20, %rs18, 14;
; CHECK-NEXT: st.param.b16 [func_retval0+14], %rs20;
; CHECK-NEXT: shr.s16 %rs21, %rs18, 12;
; CHECK-NEXT: st.param.b16 [func_retval0+12], %rs21;
; CHECK-NEXT: shr.s16 %rs22, %rs18, 10;
; CHECK-NEXT: st.param.b16 [func_retval0+10], %rs22;
; CHECK-NEXT: shr.s16 %rs23, %rs19, 6;
; CHECK-NEXT: st.param.b16 [func_retval0+6], %rs23;
; CHECK-NEXT: st.param.b16 [func_retval0+4], 3;
; CHECK-NEXT: st.param.b16 [func_retval0+2], 2;
; CHECK-NEXT: ret;
  %v0 = insertelement <8 x i2> poison, i2 1, i32 %idx
  %v1 = insertelement <8 x i2> %v0, i2 2, i32 1
  %v2 = insertelement <8 x i2> %v1, i2 3, i32 2
  ret <8 x i2> %v2
}

; Test with i3 elements (3-bit, non-byte-aligned)
; IR: i3 1 at %idx, then i3 2 and i3 3 at constant lanes 1 and 2.
; Expected PTX: the index is masked with 7 and the element is written as one
; byte into an 8-byte depot; the result is returned as four 32-bit stores,
; each packing two 16-bit lanes.
define <8 x i3> @dynamic_i3(i32 %idx) {
; CHECK-LABEL: dynamic_i3(
; CHECK: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot11[8];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<15>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot11;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i3_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 7;
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2};
; CHECK-NEXT: st.b8 [%rd4], 1;
; CHECK-NEXT: ld.b32 %r3, [%SP];
; CHECK-NEXT: ld.b32 %r4, [%SP+4];
; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U;
; CHECK-NEXT: st.param.b32 [func_retval0+12], %r7;
; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U;
; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U;
; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U;
; CHECK-NEXT: st.param.b32 [func_retval0+8], %r10;
; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r11;
; CHECK-NEXT: mov.b16 %rs2, 3;
; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1};
; CHECK-NEXT: st.param.b32 [func_retval0+4], %r12;
; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r13;
; CHECK-NEXT: mov.b16 %rs4, 2;
; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4};
; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
; CHECK-NEXT: ret;
  %v0 = insertelement <8 x i3> poison, i3 1, i32 %idx
  %v1 = insertelement <8 x i3> %v0, i3 2, i32 1
  %v2 = insertelement <8 x i3> %v1, i3 3, i32 2
  ret <8 x i3> %v2
}

; Test with i4 elements (4-bit, non-byte-aligned)
; IR: i4 1 at %idx, then i4 2 and i4 3 at constant lanes 1 and 2.
; Expected PTX: the index is masked with 7 and the element is written as one
; byte into a 16-byte depot; the result is returned as four 32-bit stores,
; each packing two 16-bit lanes.
define <8 x i4> @dynamic_i4(i32 %idx) {
; CHECK-LABEL: dynamic_i4(
; CHECK: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot12[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b16 %rs<30>;
; CHECK-NEXT: .reg .b32 %r<22>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot12;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i4_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 7;
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2};
; CHECK-NEXT: st.b8 [%rd4], 1;
; CHECK-NEXT: ld.b32 %r3, [%SP];
; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r4;
; CHECK-NEXT: and.b16 %rs2, %rs1, 15;
; CHECK-NEXT: prmt.b32 %r5, %r3, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT: and.b16 %rs4, %rs3, 15;
; CHECK-NEXT: shl.b16 %rs5, %rs4, 4;
; CHECK-NEXT: or.b16 %rs6, %rs2, %rs5;
; CHECK-NEXT: prmt.b32 %r6, %r3, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs7, %r6;
; CHECK-NEXT: and.b16 %rs8, %rs7, 15;
; CHECK-NEXT: shl.b16 %rs9, %rs8, 8;
; CHECK-NEXT: or.b16 %rs10, %rs6, %rs9;
; CHECK-NEXT: prmt.b32 %r7, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs11, %r7;
; CHECK-NEXT: shl.b16 %rs12, %rs11, 12;
; CHECK-NEXT: or.b16 %rs13, %rs10, %rs12;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs13;
; CHECK-NEXT: ld.b32 %r9, [%SP+4];
; CHECK-NEXT: prmt.b32 %r10, %r9, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs14, %r10;
; CHECK-NEXT: and.b16 %rs15, %rs14, 15;
; CHECK-NEXT: prmt.b32 %r11, %r9, 0, 0x7771U;
; CHECK-NEXT: cvt.u16.u32 %rs16, %r11;
; CHECK-NEXT: and.b16 %rs17, %rs16, 15;
; CHECK-NEXT: shl.b16 %rs18, %rs17, 4;
; CHECK-NEXT: or.b16 %rs19, %rs15, %rs18;
; CHECK-NEXT: prmt.b32 %r12, %r9, 0, 0x7772U;
; CHECK-NEXT: cvt.u16.u32 %rs20, %r12;
; CHECK-NEXT: and.b16 %rs21, %rs20, 15;
; CHECK-NEXT: shl.b16 %rs22, %rs21, 8;
; CHECK-NEXT: or.b16 %rs23, %rs19, %rs22;
; CHECK-NEXT: prmt.b32 %r13, %r9, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs24, %r13;
; CHECK-NEXT: shl.b16 %rs25, %rs24, 12;
; CHECK-NEXT: or.b16 %rs26, %rs23, %rs25;
; CHECK-NEXT: cvt.u32.u16 %r14, %rs26;
; CHECK-NEXT: shl.b32 %r15, %r14, 16;
; CHECK-NEXT: or.b32 %r16, %r8, %r15;
; CHECK-NEXT: mov.b32 %r17, {%rs20, %rs24};
; CHECK-NEXT: st.param.b32 [func_retval0+12], %r17;
; CHECK-NEXT: mov.b32 %r18, {%rs14, %rs16};
; CHECK-NEXT: st.param.b32 [func_retval0+8], %r18;
; CHECK-NEXT: mov.b16 %rs27, 2;
; CHECK-NEXT: mov.b32 %r19, {%rs1, %rs27};
; CHECK-NEXT: st.param.b32 [func_retval0], %r19;
; CHECK-NEXT: shr.u32 %r20, %r16, 12;
; CHECK-NEXT: cvt.u16.u32 %rs28, %r20;
; CHECK-NEXT: mov.b16 %rs29, 3;
; CHECK-NEXT: mov.b32 %r21, {%rs29, %rs28};
; CHECK-NEXT: st.param.b32 [func_retval0+4], %r21;
; CHECK-NEXT: ret;
  %v0 = insertelement <8 x i4> poison, i4 1, i32 %idx
  %v1 = insertelement <8 x i4> %v0, i4 2, i32 1
  %v2 = insertelement <8 x i4> %v1, i4 3, i32 2
  ret <8 x i4> %v2
}

; Test with i5 elements (5-bit, non-byte-aligned)
; IR: i5 1 at %idx, then i5 2 and i5 3 at constant lanes 1 and 2.
; Expected PTX: the index is masked with 7 and the element is written as one
; byte into an 8-byte depot; the result is returned as two v2.b32 stores.
define <8 x i5> @dynamic_i5(i32 %idx) {
; CHECK-LABEL: dynamic_i5(
; CHECK: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot13[8];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<15>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot13;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i5_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 7;
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2};
; CHECK-NEXT: st.b8 [%rd4], 1;
; CHECK-NEXT: ld.b32 %r3, [%SP];
; CHECK-NEXT: ld.b32 %r4, [%SP+4];
; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U;
; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U;
; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U;
; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U;
; CHECK-NEXT: st.param.v2.b32 [func_retval0+8], {%r10, %r7};
; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r11;
; CHECK-NEXT: mov.b16 %rs2, 3;
; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1};
; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r13;
; CHECK-NEXT: mov.b16 %rs4, 2;
; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4};
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r12};
; CHECK-NEXT: ret;
  %v0 = insertelement <8 x i5> poison, i5 1, i32 %idx
  %v1 = insertelement <8 x i5> %v0, i5 2, i32 1
  %v2 = insertelement <8 x i5> %v1, i5 3, i32 2
  ret <8 x i5> %v2
}

; Test with i7 elements (7-bit, non-byte-aligned)
; IR: i7 1 at %idx, then i7 2 and i7 3 at constant lanes 1 and 2.
; Expected PTX: the index is masked with 7 and the element is written as one
; byte into an 8-byte depot; the result is returned as two v2.b32 stores.
define <8 x i7> @dynamic_i7(i32 %idx) {
; CHECK-LABEL: dynamic_i7(
; CHECK: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot14[8];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<15>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot14;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i7_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 7;
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2};
; CHECK-NEXT: st.b8 [%rd4], 1;
; CHECK-NEXT: ld.b32 %r3, [%SP];
; CHECK-NEXT: ld.b32 %r4, [%SP+4];
; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U;
; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U;
; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U;
; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U;
; CHECK-NEXT: st.param.v2.b32 [func_retval0+8], {%r10, %r7};
; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r11;
; CHECK-NEXT: mov.b16 %rs2, 3;
; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1};
; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r13;
; CHECK-NEXT: mov.b16 %rs4, 2;
; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4};
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r12};
; CHECK-NEXT: ret;
  %v0 = insertelement <8 x i7> poison, i7 1, i32 %idx
  %v1 = insertelement <8 x i7> %v0, i7 2, i32 1
  %v2 = insertelement <8 x i7> %v1, i7 3, i32 2
  ret <8 x i7> %v2
}

; Test with i6 elements (6-bit, non-byte-aligned)
; IR: i6 1 at %idx, then i6 2 and i6 3 at constant lanes 1 and 2.
; Expected PTX: the index is masked with 7 and the element is written as one
; byte into an 8-byte depot; the result is returned as two v2.b32 stores.
define <8 x i6> @dynamic_i6(i32 %idx) {
; CHECK-LABEL: dynamic_i6(
; CHECK: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot15[8];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<15>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %SPL, __local_depot15;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.param.b32 %rd1, [dynamic_i6_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 7;
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
; CHECK-NEXT: st.v2.b32 [%SP], {%r1, %r2};
; CHECK-NEXT: st.b8 [%rd4], 1;
; CHECK-NEXT: ld.b32 %r3, [%SP];
; CHECK-NEXT: ld.b32 %r4, [%SP+4];
; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: prmt.b32 %r6, %r4, 0, 0x7772U;
; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U;
; CHECK-NEXT: prmt.b32 %r8, %r4, 0, 0x7771U;
; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7770U;
; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x5410U;
; CHECK-NEXT: st.param.v2.b32 [func_retval0+8], {%r10, %r7};
; CHECK-NEXT: prmt.b32 %r11, %r3, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r11;
; CHECK-NEXT: mov.b16 %rs2, 3;
; CHECK-NEXT: mov.b32 %r12, {%rs2, %rs1};
; CHECK-NEXT: prmt.b32 %r13, %r3, 0, 0x7770U;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r13;
; CHECK-NEXT: mov.b16 %rs4, 2;
; CHECK-NEXT: mov.b32 %r14, {%rs3, %rs4};
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r12};
; CHECK-NEXT: ret;
  %v0 = insertelement <8 x i6> poison, i6 1, i32 %idx
  %v1 = insertelement <8 x i6> %v0, i6 2, i32 1
  %v2 = insertelement <8 x i6> %v1, i6 3, i32 2
  ret <8 x i6> %v2
}

; Test with multiple dynamic insertions on i3 elements
; IR: i3 1 at %idx0, then i3 2 at %idx1, on a four-element vector.
; Expected PTX: no local depot is declared; each insertion is a bfi.b32 whose
; bit position is the index shifted left by 3 and whose field width is 8, and
; the result is returned as two 16-bit stores.
define <4 x i3> @multiple_dynamic_i3(i32 %idx0, i32 %idx1) {
; CHECK-LABEL: multiple_dynamic_i3(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [multiple_dynamic_i3_param_0];
; CHECK-NEXT: shl.b32 %r2, %r1, 3;
; CHECK-NEXT: bfi.b32 %r3, 1, %r4, %r2, 8;
; CHECK-NEXT: ld.param.b32 %r5, [multiple_dynamic_i3_param_1];
; CHECK-NEXT: shl.b32 %r6, %r5, 3;
; CHECK-NEXT: bfi.b32 %r7, 2, %r3, %r6, 8;
; CHECK-NEXT: st.param.b16 [func_retval0], %r7;
; CHECK-NEXT: shr.u32 %r8, %r7, 16;
; CHECK-NEXT: st.param.b16 [func_retval0+2], %r8;
; CHECK-NEXT: ret;
  %v0 = insertelement <4 x i3> poison, i3 1, i32 %idx0
  %v1 = insertelement <4 x i3> %v0, i3 2, i32 %idx1
  ret <4 x i3> %v1
}