diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 17 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 53 |
2 files changed, 45 insertions, 25 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index e8b211f..7f00ead 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -176,6 +176,19 @@ def binop_s64_with_s32_mask_combines : GICombineGroup<[ combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask ]>; +// (or i64:x, (zext i32:y)) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +// (or (zext i32:y), i64:x) -> i64:(merge (or lo_32(x), i32:y), hi_32(x)) +def or_s64_zext_s32_frag : GICombinePatFrag<(outs root:$dst), (ins $src_s64, $src_s32), + [(pattern (G_OR $dst, i64:$src_s64, i64:$zext_val), (G_ZEXT i64:$zext_val, i32:$src_s32)), + (pattern (G_OR $dst, i64:$zext_val, i64:$src_s64), (G_ZEXT i64:$zext_val, i32:$src_s32))]>; + +def combine_or_s64_s32 : GICombineRule< + (defs root:$dst), + (match (or_s64_zext_s32_frag $dst, i64:$x, i32:$y):$dst), + (apply (G_UNMERGE_VALUES $x_lo, $x_hi, $x), + (G_OR $or, $x_lo, $y), + (G_MERGE_VALUES $dst, $or, $x_hi))>; + let Predicates = [Has16BitInsts, NotHasMed3_16] in { // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This // saves one instruction compared to the promotion. @@ -206,7 +219,7 @@ def AMDGPUPreLegalizerCombiner: GICombiner< "AMDGPUPreLegalizerCombinerImpl", [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16, foldable_fneg, combine_shuffle_vector_to_build_vector, - binop_s64_with_s32_mask_combines]> { + binop_s64_with_s32_mask_combines, combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } @@ -215,7 +228,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64, - binop_s64_with_s32_mask_combines]> { + binop_s64_with_s32_mask_combines, combine_or_s64_s32]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 596a895..1a13b22 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -976,9 +976,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, FPOpActions.clampMaxNumElementsStrict(0, S32, 2); } + auto &MinNumMaxNumIeee = + getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); + + if (ST.hasVOP3PInsts()) { + MinNumMaxNumIeee.legalFor(FPTypesPK16) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .clampMaxNumElements(0, S16, 2) + .clampScalar(0, S16, S64) + .scalarize(0); + } else if (ST.has16BitInsts()) { + MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0); + } else { + MinNumMaxNumIeee.legalFor(FPTypesBase) + .clampScalar(0, S32, S64) + .scalarize(0); + } + auto &MinNumMaxNum = getActionDefinitionsBuilder( - {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE, - G_FMAXNUM_IEEE}); + {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM}); if (ST.hasVOP3PInsts()) { MinNumMaxNum.customFor(FPTypesPK16) @@ -2136,9 +2152,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor(FPTypesPK16) .clampMaxNumElements(0, S16, 2) .scalarize(0); + } else if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) + .lowerFor({V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .lower(); } else { - // TODO: Implement - getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); + getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) + .scalarize(0) + .clampScalar(0, S32, S64) + .lower(); } getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) @@ -2195,8 +2219,6 @@ bool AMDGPULegalizerInfo::legalizeCustom( case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINIMUMNUM: case TargetOpcode::G_FMAXIMUMNUM: - case TargetOpcode::G_FMINNUM_IEEE: - case TargetOpcode::G_FMAXNUM_IEEE: return legalizeMinNumMaxNum(Helper, MI); case TargetOpcode::G_EXTRACT_VECTOR_ELT: return legalizeExtractVectorElt(MI, MRI, B); @@ -2817,23 +2839,8 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineFunction &MF = Helper.MIRBuilder.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || - MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; - - // With ieee_mode disabled, the instructions have the correct behavior - // already for G_FMINIMUMNUM/G_FMAXIMUMNUM. - // - // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode - // enabled. - if (!MFI->getMode().IEEE) { - if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM || - MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM) - return true; - - return !IsIEEEOp; - } - - if (IsIEEEOp) + // With ieee_mode disabled, the instructions have the correct behavior. + if (!MFI->getMode().IEEE) return true; return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; |
