diff options
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 29 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/true16-fold.mir | 149 |
3 files changed, 163 insertions, 23 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 2bfc37b..d6acf9e 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1899,7 +1899,13 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) { if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg())) return false; - MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg()); + if (!ClampSrc->getReg().isVirtual()) + return false; + + // Look through COPY. COPY only observed with True16. + Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI); + MachineInstr *Def = + MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg()); // The type of clamp must be compatible. if (TII->getClampMask(*Def) != TII->getClampMask(MI)) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 15cb404..beac41e 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -269,19 +269,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2 } define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 { -; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; SDAG-GFX1100-TRUE16: ; %bb.0: -; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp -; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; SDAG-GFX1100-FAKE16: ; %bb.0: -; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; GFX900: ; %bb.0: @@ -312,12 +304,6 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; GISEL-GFX1100: ; %bb.0: -; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1524,10 +1510,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir index ef6e400..93cc12f 100644 --- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir @@ -57,4 +57,153 @@ body: | %4:vgpr_16 = COPY %3:sgpr_lo16 %5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec S_ENDPGM 0, implicit %5 + +--- +name: fold_16bit_madmix_clamp +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_16bit_madmix_clamp + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]] + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIXLO_F16_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:sreg_32 = IMPLICIT_DEF + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec + %6:vgpr_16 = COPY %5 + %7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %7 + S_ENDPGM 0, implicit $vgpr0 +... + +--- +name: fold_16bit_subreg_1_clamp +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_16bit_subreg_1_clamp + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:sreg_32 = IMPLICIT_DEF + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec + %6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %5.lo16, 0, %5.lo16, -1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %6 + S_ENDPGM 0, implicit $vgpr0 +... + +--- +name: fold_16bit_subreg_2_clamp +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_16bit_subreg_2_clamp + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:sreg_32 = IMPLICIT_DEF + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec + %6:vgpr_16 = COPY %5.lo16 + %7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %7 + S_ENDPGM 0, implicit $vgpr0 +... + +--- +name: fold_16bit_phyreg_clamp +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_16bit_phyreg_clamp + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr10_lo16 = COPY [[V_FMA_MIXLO_F16_]] + ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:sreg_32 = IMPLICIT_DEF + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec + $vgpr10_lo16 = COPY %5 + %6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %6 + S_ENDPGM 0, implicit $vgpr0 +... + +--- +name: fold_16bit_undef_clamp +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_16bit_undef_clamp + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[DEF]], 0, [[DEF]], -1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:vgpr_16 = IMPLICIT_DEF + %4:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %3, 0, %3, -1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %4 + S_ENDPGM 0, implicit $vgpr0 ... |