From bf257c548cb71dea6da9c11d154f629c65dc2275 Mon Sep 17 00:00:00 2001
From: Edwin Lu
Date: Tue, 5 Aug 2025 16:24:39 -0700
Subject: Match: Support SAT_TRUNC variant NARROW_CLIP

This patch adds support for a variant of SAT_TRUNC where negative
numbers are clipped to 0 instead of NARROW_TYPE_MAX_VALUE.  This form
is seen in x264:

UT clip (T a)
{
  return a & ~(UT)(-1) ? (-a) >> 31 : a;
}

where sizeof (UT) < sizeof (T).

I was unable to get the SAT_TRUNC pattern to trigger on x86_64; however,
it does appear when building for riscv, as seen below.

Before this patch:

   [local count: 764504183]:
  # i_21 = PHI
  # vectp_x.10_54 = PHI
  # vectp_res.20_66 = PHI
  # ivtmp_70 = PHI
  _72 = .SELECT_VL (ivtmp_70, POLY_INT_CST [4, 4]);
  _1 = (long unsigned int) i_21;
  _2 = _1 * 4;
  _3 = x_10(D) + _2;
  ivtmp_53 = _72 * 4;
  vect__4.12_57 = .MASK_LEN_LOAD (vectp_x.10_54, 32B, { -1, ... }, _56(D), _72, 0);
  vect_x.13_58 = VIEW_CONVERT_EXPR(vect__4.12_57);
  vect__38.15_60 = -vect_x.13_58;
  vect__15.16_61 = VIEW_CONVERT_EXPR(vect__38.15_60);
  vect__16.17_62 = vect__15.16_61 >> 31;
  mask__29.14_59 = vect_x.13_58 > { 255, ... };
  vect__17.18_63 = VEC_COND_EXPR ;
  vect__18.19_64 = (vector([4,4]) unsigned char) vect__17.18_63;
  _4 = *_3;
  _5 = res_11(D) + _1;
  x.0_12 = (unsigned int) _4;
  _38 = -x.0_12;
  _15 = (int) _38;
  _16 = _15 >> 31;
  _29 = x.0_12 > 255;
  _17 = _29 ? _16 : _4;
  _18 = (unsigned char) _17;
  .MASK_LEN_STORE (vectp_res.20_66, 8B, { -1, ... }, _72, 0, vect__18.19_64);
  i_14 = i_21 + 1;
  vectp_x.10_55 = vectp_x.10_54 + ivtmp_53;
  vectp_res.20_67 = vectp_res.20_66 + _72;
  ivtmp_71 = ivtmp_70 - _72;
  if (ivtmp_71 != 0)
    goto ; [89.00%]
  else
    goto ; [11.00%]

After this patch:

   [local count: 764504183]:
  # i_21 = PHI
  # vectp_x.10_68 = PHI
  # vectp_res.15_75 = PHI
  # ivtmp_79 = PHI
  _81 = .SELECT_VL (ivtmp_79, POLY_INT_CST [4, 4]);
  _1 = (long unsigned int) i_21;
  _2 = _1 * 4;
  _3 = x_10(D) + _2;
  ivtmp_67 = _81 * 4;
  vect__4.12_71 = .MASK_LEN_LOAD (vectp_x.10_68, 32B, { -1, ... }, _70(D), _81, 0);
  vect_patt_37.13_72 = MAX_EXPR <{ 0, ... }, vect__4.12_71>;
  vect_patt_39.14_73 = .SAT_TRUNC (vect_patt_37.13_72);
  _4 = *_3;
  _5 = res_11(D) + _1;
  x.0_12 = (unsigned int) _4;
  _38 = -x.0_12;
  _15 = (int) _38;
  _16 = _15 >> 31;
  _29 = x.0_12 > 255;
  _17 = _29 ? _16 : _4;
  _18 = (unsigned char) _17;
  .MASK_LEN_STORE (vectp_res.15_75, 8B, { -1, ... }, _81, 0, vect_patt_39.14_73);
  i_14 = i_21 + 1;
  vectp_x.10_69 = vectp_x.10_68 + ivtmp_67;
  vectp_res.15_76 = vectp_res.15_75 + _81;
  ivtmp_80 = ivtmp_79 - _81;
  if (ivtmp_80 != 0)
    goto ; [89.00%]
  else
    goto ; [11.00%]
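
In scalar form the rewrite is equivalent to the following minimal sketch
(illustration only, for T = int and UT = unsigned char; this helper is
not code from the patch): clamp negatives to zero first, then do an
unsigned saturating truncation.

  /* Sketch of the equivalence, not part of the patch:
     clip (a) == SAT_TRUNC (MAX (0, a)), i.e. the MAX_EXPR + .SAT_TRUNC
     sequence in the "After" dump.  */
  unsigned char
  clip_as_max_sat_trunc (int a)
  {
    unsigned int clamped = a < 0 ? 0u : (unsigned int) a;    /* MAX_EXPR <0, a> */
    return clamped > 255u ? 255u : (unsigned char) clamped;  /* .SAT_TRUNC to uint8 */
  }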
gcc/ChangeLog:

	* match.pd: Add NARROW_CLIP variant for SAT_TRUNC.
	* tree-vect-patterns.cc (gimple_unsigned_integer_narrow_clip):
	Add new decl for NARROW_CLIP.
	(vect_recog_sat_trunc_pattern): Add NARROW_CLIP check.

Signed-off-by: Edwin Lu
---
 gcc/match.pd              | 26 ++++++++++++++++++++++++++
 gcc/tree-vect-patterns.cc | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index f61beb6..dbbb7b5 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3369,6 +3369,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
       }
       (if (wi::eq_p (sum, wi::uhwi (0, precision))))))))
 
+(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
+ (match (unsigned_integer_narrow_clip @0)
+  /* SAT_U_TRUNC(MAX (0, X)) =
+     (UT)X & ~(NT)(-1) ? (-X) >> TYPE_PRECISION(X) - 1 : X
+
+     The gimple representation uses X > ~(NT)(-1) instead of
+     using & so match on gt instead of bit_and. */
+  (convert (cond^ (gt (nop_convert? @0) INTEGER_CST@1)
+		  (rshift:s (nop_convert? (negate (nop_convert? @0))) INTEGER_CST@2)
+		  @0))
+  (if (! TYPE_UNSIGNED (TREE_TYPE (@0)))
+   (with
+    {
+     unsigned itype_precision = TYPE_PRECISION (TREE_TYPE (@0));
+     unsigned otype_precision = TYPE_PRECISION (type);
+     wide_int trunc_max = wi::mask (otype_precision, false, itype_precision);
+     wide_int int_cst_1 = wi::to_wide (@1, itype_precision);
+     wide_int int_cst_2 = wi::to_wide (@2, itype_precision);
+     wide_int shift_amount = wi::uhwi (itype_precision - 1,
+				       itype_precision); // AKA itype_bits - 1
+     int cmp = 0;
+     cmp = wi::cmp (int_cst_2, shift_amount, TYPE_SIGN (TREE_TYPE (@0)));
+    }
+    (if (otype_precision < itype_precision && wi::eq_p (trunc_max,
+	 int_cst_1) && (cmp >= 0)))))))
+
 /* Saturation truncate for unsigned integer. */
 (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
  (match (unsigned_integer_sat_trunc @0)
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index b39da10..70bf768 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -4538,6 +4538,8 @@ extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
 extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree));
 extern bool gimple_unsigned_integer_sat_trunc (tree, tree*, tree (*)(tree));
 
+extern bool gimple_unsigned_integer_narrow_clip (tree, tree*, tree (*)(tree));
+
 extern bool gimple_signed_integer_sat_add (tree, tree*, tree (*)(tree));
 extern bool gimple_signed_integer_sat_sub (tree, tree*, tree (*)(tree));
 extern bool gimple_signed_integer_sat_trunc (tree, tree*, tree (*)(tree));
@@ -4772,6 +4774,36 @@ vect_recog_sat_trunc_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
   tree lhs = gimple_assign_lhs (last_stmt);
   tree otype = TREE_TYPE (lhs);
 
+  if ((gimple_unsigned_integer_narrow_clip (lhs, ops, NULL))
+      && type_has_mode_precision_p (otype))
+    {
+      tree itype = TREE_TYPE (ops[0]);
+      tree v_itype = get_vectype_for_scalar_type (vinfo, itype);
+      tree v_otype = get_vectype_for_scalar_type (vinfo, otype);
+      internal_fn fn = IFN_SAT_TRUNC;
+
+      if (v_itype != NULL_TREE && v_otype != NULL_TREE
+	  && direct_internal_fn_supported_p (fn, tree_pair (v_otype, v_itype),
+					     OPTIMIZE_FOR_BOTH))
+	{
+	  tree temp = vect_recog_temp_ssa_var (itype, NULL);
+	  gimple * max_stmt = gimple_build_assign (temp, build2 (MAX_EXPR, itype, build_zero_cst(itype), ops[0]));
+	  append_pattern_def_seq (vinfo, stmt_vinfo, max_stmt, v_itype);
+
+	  gcall *call = gimple_build_call_internal (fn, 1, temp);
+	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);
+
+	  gimple_call_set_lhs (call, out_ssa);
+	  gimple_call_set_nothrow (call, /* nothrow_p */ false);
+	  gimple_set_location (call, gimple_location (last_stmt));
+
+	  *type_out = v_otype;
+
+	  return call;
+	}
+
+    }
+
   if ((gimple_unsigned_integer_sat_trunc (lhs, ops, NULL)
        || gimple_signed_integer_sat_trunc (lhs, ops, NULL))
       && type_has_mode_precision_p (otype))
-- 
cgit v1.1
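
For reference, a scalar loop of the shape shown in the dumps above (an
illustrative sketch, not the testcase from this patch) is:

  /* Illustrative only, not part of the patch or its testsuite: with the
     new pattern the loop body should be recognized as MAX_EXPR <0, x[i]>
     followed by .SAT_TRUNC when vectorizing for riscv.  */
  void
  clip_loop (unsigned char *res, int *x, int n)
  {
    for (int i = 0; i < n; i++)
      res[i] = x[i] & ~255 ? (-x[i]) >> 31 : x[i];
  }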