; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s

; Source-level equivalent of the kernel under test (OpenCL C):
;
; kernel void combine_vloads(global char8 *src, global char8 *result) {
;   for (int i = 0; i < 1024; ++i)
;     result[i] = src[0] + src[1] + src[2] + src[3];
; }
;
; The IR below reads src[0..3] as a single 32-byte <8 x i32> load and slices
; it into four <8 x i8> values with shufflevector. The checks expect that
; load to be emitted as two 128-bit vertex fetches (VTX_READ_128 at offsets
; 0 and 16) and the 8-byte sum to be written with one STORE_RAW of T14.XY.

; 128-bit loads instead of many 8-bit
define amdgpu_kernel void @combine_vloads(ptr addrspace(1) nocapture %src, ptr addrspace(1) nocapture %result) nounwind {
; EG-LABEL: combine_vloads:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 3, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT:    LOOP_START_DX10 @10
; EG-NEXT:    TEX 1 @12
; EG-NEXT:    ALU 86, @20, KC0[], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T14.XY, T15.X, 0
; EG-NEXT:    ALU_PUSH_BEFORE 4, @107, KC0[], KC1[]
; EG-NEXT:    JUMP @9 POP:1
; EG-NEXT:    LOOP_BREAK @9
; EG-NEXT:    POP @9 POP:1
; EG-NEXT:    END_LOOP @2
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 12:
; EG-NEXT:     VTX_READ_128 T14.XYZW, T13.X, 0, #1
; EG-NEXT:     VTX_READ_128 T15.XYZW, T13.X, 16, #1
; EG-NEXT:    ALU clause starting at 16:
; EG-NEXT:     MOV T13.X, KC0[2].Y,
; EG-NEXT:     MOV T0.W, KC0[2].Z,
; EG-NEXT:     MOV * T1.W, literal.x,
; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT:    ALU clause starting at 20:
; EG-NEXT:     LSHR T2.W, T14.Y, literal.x,
; EG-NEXT:     LSHR * T3.W, T14.W, literal.x,
; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT:     ADD_INT T2.W, PV.W, PS,
; EG-NEXT:     LSHR * T3.W, T15.Y, literal.x,
; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.Y, T14.Y, literal.x,
; EG-NEXT:     LSHR T0.Z, T14.W, literal.x,
; EG-NEXT:     ADD_INT T2.W, PV.W, PS,
; EG-NEXT:     LSHR * T3.W, T15.W, literal.y,
; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT:     ADD_INT T16.X, PV.W, PS,
; EG-NEXT:     ADD_INT T0.Y, PV.Y, PV.Z,
; EG-NEXT:     LSHR T0.Z, T15.Y, literal.x,
; EG-NEXT:     LSHR T2.W, T14.X, literal.y,
; EG-NEXT:     LSHR * T3.W, T14.Z, literal.y,
; EG-NEXT:    8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT:     ADD_INT T17.X, PV.W, PS,
; EG-NEXT:     ADD_INT T0.Y, PV.Y, PV.Z,
; EG-NEXT:     LSHR T0.Z, T15.W, literal.x,
; EG-NEXT:     LSHR T2.W, T14.Y, literal.y,
; EG-NEXT:     LSHR * T3.W, T14.W, literal.y,
; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT:     LSHR T18.X, T15.X, literal.x,
; EG-NEXT:     LSHR T1.Y, T14.X, literal.y, BS:VEC_120/SCL_212
; EG-NEXT:     ADD_INT T1.Z, PV.W, PS,
; EG-NEXT:     LSHR T2.W, T15.Y, literal.z,
; EG-NEXT:     ADD_INT * T3.W, PV.Y, PV.Z,
; EG-NEXT:    24(3.363116e-44), 8(1.121039e-44)
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T19.X, T14.Z, literal.x,
; EG-NEXT:     ADD_INT T0.Y, T14.Y, T14.W,
; EG-NEXT:     AND_INT T0.Z, PS, literal.y,
; EG-NEXT:     ADD_INT T2.W, PV.Z, PV.W,
; EG-NEXT:     LSHR * T3.W, T15.W, literal.z,
; EG-NEXT:    8(1.121039e-44), 255(3.573311e-43)
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     ADD_INT T20.X, PV.W, PS,
; EG-NEXT:     LSHL T2.Y, PV.Z, literal.x,
; EG-NEXT:     ADD_INT T0.Z, PV.Y, T15.Y,
; EG-NEXT:     ADD_INT T2.W, T1.Y, PV.X,
; EG-NEXT:     LSHR * T3.W, T15.X, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     ADD_INT T19.X, T14.X, T14.Z,
; EG-NEXT:     ADD_INT T0.Y, PV.W, PS,
; EG-NEXT:     LSHR T1.Z, T15.Z, literal.x,
; EG-NEXT:     LSHR T2.W, T14.X, literal.y,
; EG-NEXT:     LSHR * T3.W, T14.Z, literal.y,
; EG-NEXT:    8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT:     ADD_INT T14.X, PV.W, PS,
; EG-NEXT:     LSHR T1.Y, T15.X, literal.x,
; EG-NEXT:     ADD_INT T1.Z, PV.Y, PV.Z,
; EG-NEXT:     ADD_INT T2.W, PV.X, T15.X,
; EG-NEXT:     ADD_INT * T3.W, T0.Z, T15.W,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     AND_INT T15.X, PS, literal.x,
; EG-NEXT:     ADD_INT T0.Y, PV.W, T15.Z,
; EG-NEXT:     AND_INT T0.Z, PV.Z, literal.x,
; EG-NEXT:     ADD_INT T2.W, PV.X, PV.Y,
; EG-NEXT:     LSHR * T3.W, T15.Z, literal.y,
; EG-NEXT:    255(3.573311e-43), 16(2.242078e-44)
; EG-NEXT:     ADD_INT T14.X, PV.W, PS,
; EG-NEXT:     LSHL T1.Y, PV.Z, literal.x,
; EG-NEXT:     AND_INT T0.Z, PV.Y, literal.y,
; EG-NEXT:     OR_INT T2.W, PV.X, T2.Y,
; EG-NEXT:     LSHL * T3.W, T20.X, literal.z,
; EG-NEXT:    8(1.121039e-44), 255(3.573311e-43)
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     OR_INT T15.X, PV.W, PS,
; EG-NEXT:     OR_INT T0.Y, PV.Z, PV.Y,
; EG-NEXT:     LSHL T0.Z, PV.X, literal.x,
; EG-NEXT:     ADD_INT T2.W, T17.X, T18.X,
; EG-NEXT:     LSHR * T3.W, T15.Z, literal.y,
; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT:     ADD_INT T1.Y, PV.W, PS,
; EG-NEXT:     OR_INT T0.Z, PV.Y, PV.Z,
; EG-NEXT:     AND_INT T2.W, PV.X, literal.x,
; EG-NEXT:     LSHL * T3.W, T16.X, literal.y,
; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
; EG-NEXT:     OR_INT T14.Y, PV.W, PS,
; EG-NEXT:     AND_INT T2.W, PV.Z, literal.x,
; EG-NEXT:     LSHL * T3.W, PV.Y, literal.y,
; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
; EG-NEXT:     OR_INT T14.X, PV.W, PS,
; EG-NEXT:     ADD_INT * T2.W, T0.W, T1.W,
; EG-NEXT:     LSHR * T15.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ALU clause starting at 107:
; EG-NEXT:     ADD_INT * T1.W, T1.W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     SETE_INT * T2.W, PV.W, literal.x,
; EG-NEXT:    8192(1.147944e-41), 0(0.000000e+00)
; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
entry:
  br label %for.body

for.exit:                                         ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  ; %i.01 is the loop counter i: 0 on entry, %tmp19 (i + 1) on the back edge.
  %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ]

  ; One 32-byte load covers src[0] through src[3]; reinterpret it as 32 bytes.
  %vecload2 = load <8 x i32>, ptr addrspace(1) %src, align 32
  %0 = bitcast <8 x i32> %vecload2 to <32 x i8>

  ; Extract each 8-byte slice and accumulate: bytes 0-7 are src[0],
  ; 8-15 are src[1], 16-23 are src[2], 24-31 are src[3].
  %tmp5 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tmp8 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp9 = add nsw <8 x i8> %tmp5, %tmp8
  %tmp12 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %tmp13 = add nsw <8 x i8> %tmp9, %tmp12
  %tmp16 = shufflevector <32 x i8> %0, <32 x i8> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tmp17 = add nsw <8 x i8> %tmp13, %tmp16

  ; result[i] = sum, written as a single 8-byte <2 x i32> store.
  %scevgep = getelementptr <8 x i8>, ptr addrspace(1) %result, i32 %i.01
  %1 = bitcast <8 x i8> %tmp17 to <2 x i32>
  store <2 x i32> %1, ptr addrspace(1) %scevgep, align 8

  ; Advance i and leave the loop once it reaches 1024.
  %tmp19 = add nsw i32 %i.01, 1
  %exitcond = icmp eq i32 %tmp19, 1024
  br i1 %exitcond, label %for.exit, label %for.body
}