author     Tamar Christina <tamar.christina@arm.com>  2023-10-18 09:32:55 +0100
committer  Tamar Christina <tamar.christina@arm.com>  2023-10-18 09:53:47 +0100
commit     4b39aeef594f311e2c1715f15608f1d7ebc2d868
tree       bc0f36b93c71bb595b54948fd82d3ecc4cc6bb7d
parent     b588dcb77e96d77777eb5647cba9e8f454e314dc
middle-end: Fold vec_cond into conditional ternary or binary operation when sharing operand [PR109154]
When, on a masked target, a vector conditional selects between the result of a
conditional (masked) operation and a value that is also one of that operation's
operands, we can fold the vector conditional into the operation.
Concretely this transforms

    c = mask1 ? (masked_op mask2 a b) : b

into

    c = masked_op (mask1 & mask2) a b
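
As a quick sanity check of the equivalence (an illustrative sketch, not part of this
patch; `masked_add`, `before`, and `after` are made-up names), the per-lane semantics
can be modelled in plain C, treating the masked operation's fallback value as the
shared operand `b`:

```c
#include <assert.h>

/* Per-lane model of a masked operation whose fallback value is the operand b.  */
static float masked_add (int mask, float a, float b)
{
  return mask ? a + b : b;
}

/* c = mask1 ? (masked_op mask2 a b) : b  */
static float before (int mask1, int mask2, float a, float b)
{
  return mask1 ? masked_add (mask2, a, b) : b;
}

/* c = masked_op (mask1 & mask2) a b  */
static float after (int mask1, int mask2, float a, float b)
{
  return masked_add (mask1 & mask2, a, b);
}

int main (void)
{
  /* Exhaustively check all four mask combinations.  */
  for (int m1 = 0; m1 < 2; m1++)
    for (int m2 = 0; m2 < 2; m2++)
      assert (before (m1, m2, 3.0f, 5.0f) == after (m1, m2, 3.0f, 5.0f));
  return 0;
}
```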
The mask is then propagated upwards by the compiler. In the SVE case we don't end up
needing an explicit mask AND here, since `mask2` ends up in the instruction that
creates the mask, which gives us a natural &.
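
For illustration only (a hand-written SVE ACLE sketch under my own assumptions, not
code from this commit): a predicated compare on SVE sets result-predicate lanes only
where its governing predicate is active, so the predicate that ends up governing the
arithmetic is already `mask1 & mask2` without a standalone AND.

```c
#include <arm_sve.h>
#include <stdint.h>

/* x[i] *= b[i] wherever b[i] > 0, written with SVE ACLE intrinsics.
   Requires an SVE-enabled toolchain (e.g. -march=armv8.2-a+sve).  */
void
scale_positive (float *x, const float *b, int64_t n)
{
  for (int64_t i = 0; i < n; i += svcntw ())
    {
      svbool_t pg_loop = svwhilelt_b32 (i, n);        /* mask1: loop predicate */
      svfloat32_t vb = svld1 (pg_loop, b + i);
      svfloat32_t vx = svld1 (pg_loop, x + i);
      /* The compare is governed by pg_loop, so inactive lanes come out false:
         pg_mul is effectively mask1 & mask2, produced by one instruction.  */
      svbool_t pg_mul = svcmpgt (pg_loop, vb, svdup_f32 (0.0f));
      /* Merging multiply: lanes where pg_mul is false keep vx.  */
      svfloat32_t vr = svmul_m (pg_mul, vx, vb);
      svst1 (pg_loop, x + i, vr);
    }
}
```

This mirrors the point above: the instruction producing `mask2` is already fed by the
other mask, so no separate AND needs to be emitted.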
Such transformations are more common in GCC 13+ now that PRE has started unsharing
common code when doing so can make one branch fully independent. E.g. in this case
`b` becomes a loop-invariant value after PRE.
This transformation removes the extra select for masked architectures but
doesn't fix the general case.
gcc/ChangeLog:
PR tree-optimization/109154
* match.pd: Add new cond_op rule.
gcc/testsuite/ChangeLog:
PR tree-optimization/109154
* gcc.target/aarch64/sve/pre_cond_share_1.c: New test.
-rw-r--r--  gcc/match.pd                                             |  24
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/pre_cond_share_1.c  | 132
2 files changed, 156 insertions, 0 deletions
```diff
diff --git a/gcc/match.pd b/gcc/match.pd
index 067328a..a56838f 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8970,6 +8970,30 @@ and,
          && fold_real_zero_addition_p (type, NULL_TREE, @5, 0)))
      (IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
 
+/* Detect simplification for vector condition folding where
+
+   c = mask1 ? (masked_op mask2 a b) : b
+
+   into
+
+   c = masked_op (mask1 & mask2) a b
+
+   where the operation can be partially applied to one operand.  */
+
+(for cond_op (COND_BINARY)
+ (simplify
+  (vec_cond @0
+   (cond_op:s @1 @2 @3 @4) @3)
+  (cond_op (bit_and @1 @0) @2 @3 @4)))
+
+/* And same for ternary expressions.  */
+
+(for cond_op (COND_TERNARY)
+ (simplify
+  (vec_cond @0
+   (cond_op:s @1 @2 @3 @4 @5) @4)
+  (cond_op (bit_and @1 @0) @2 @3 @4 @5)))
+
 /* For pointers @0 and @2 and nonnegative constant offset @1, look for
    expressions like:
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pre_cond_share_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pre_cond_share_1.c
new file mode 100644
index 0000000..b51d0f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pre_cond_share_1.c
@@ -0,0 +1,132 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <math.h>
+#include <float.h>
+
+typedef struct __attribute__((__packed__)) _Atom {
+  float x, y, z;
+  int32_t type;
+} Atom;
+
+typedef struct __attribute__((__packed__)) _FFParams {
+  int32_t hbtype;
+  float radius;
+  float hphb;
+  float elsc;
+} FFParams;
+
+#ifndef PPWI
+#define PPWI (64)
+#endif
+
+#ifndef ITERS
+#define ITERS 8
+#endif
+
+#define DIFF_TOLERANCE_PCT 0.025f
+
+#define POSES_SIZE 393216
+#define PROTEIN_SIZE 938
+#define LIGAND_SIZE 26
+#define FORCEFIELD_SIZE 34
+
+#define ZERO 0.0f
+#define QUARTER 0.25f
+#define HALF 0.5f
+#define ONE 1.0f
+#define TWO 2.0f
+#define FOUR 4.0f
+#define CNSTNT 45.0f
+
+// Energy evaluation parameters
+#define HBTYPE_F 70
+#define HBTYPE_E 69
+#define HARDNESS 38.0f
+#define NPNPDIST 5.5f
+#define NPPDIST 1.0f
+
+void
+fasten_main(size_t group, size_t ntypes, size_t nposes, size_t natlig, size_t natpro, //
+            const Atom *protein, const Atom *ligand, //
+            const float *transforms_0, const float *transforms_1, const float *transforms_2, //
+            const float *transforms_3, const float *transforms_4, const float *transforms_5, //
+            const FFParams *forcefield, float *energies //
+) {
+
+  float etot[PPWI];
+  float lpos_x[PPWI];
+
+  for (int l = 0; l < PPWI; l++) {
+    etot[l] = 0.f;
+    lpos_x[l] = 0.f;
+  }
+
+  // Loop over ligand atoms
+  for (int il = 0; il < natlig; il++) {
+    // Load ligand atom data
+    const Atom l_atom = ligand[il];
+    const FFParams l_params = forcefield[l_atom.type];
+    const int lhphb_ltz = l_params.hphb < 0.f;
+    const int lhphb_gtz = l_params.hphb > 0.f;
+
+    // Transform ligand atom
+
+    // Loop over protein atoms
+    for (int ip = 0; ip < natpro; ip++) {
+      // Load protein atom data
+      const Atom p_atom = protein[ip];
+      const FFParams p_params = forcefield[p_atom.type];
+
+      const float radij = p_params.radius + l_params.radius;
+      const float r_radij = ONE / radij;
+
+      const float elcdst = (p_params.hbtype == HBTYPE_F && l_params.hbtype == HBTYPE_F) ? FOUR
+                                                                                        : TWO;
+      const float elcdst1 = (p_params.hbtype == HBTYPE_F && l_params.hbtype == HBTYPE_F)
+                                ? QUARTER : HALF;
+      const int type_E = ((p_params.hbtype == HBTYPE_E || l_params.hbtype == HBTYPE_E));
+
+      const int phphb_ltz = p_params.hphb < 0.f;
+      const int phphb_gtz = p_params.hphb > 0.f;
+      const int phphb_nz = p_params.hphb != 0.f;
+      const float p_hphb = p_params.hphb * (phphb_ltz && lhphb_gtz ? -ONE : ONE);
+      const float l_hphb = l_params.hphb * (phphb_gtz && lhphb_ltz ? -ONE : ONE);
+      const float distdslv = (phphb_ltz ? (lhphb_ltz ? NPNPDIST : NPPDIST) : (lhphb_ltz
+                                                                                  ? NPPDIST
+                                                                                  : -FLT_MAX));
+      const float r_distdslv = ONE / distdslv;
+
+      const float chrg_init = l_params.elsc * p_params.elsc;
+      const float dslv_init = p_hphb + l_hphb;
+
+      for (int l = 0; l < PPWI; l++) {
+        // Calculate distance between atoms
+        const float x = lpos_x[l] - p_atom.x;
+        const float distij = (x * x);
+
+        // Calculate the sum of the sphere radii
+        const float distbb = distij - radij;
+
+        const int zone1 = (distbb < ZERO);
+
+        // Calculate formal and dipole charge interactions
+        float chrg_e = chrg_init * ((zone1 ? ONE : (ONE - distbb * elcdst1)) *
+                                    (distbb < elcdst ? ONE : ZERO));
+        float neg_chrg_e = -fabsf(chrg_e);
+        chrg_e = type_E ? neg_chrg_e : chrg_e;
+        etot[l] += chrg_e * CNSTNT;
+      }
+    }
+  }
+
+  // Write result
+  for (int l = 0; l < PPWI; l++) {
+    energies[group * PPWI + l] = etot[l] * HALF;
+  }
+}
+
+/* { dg-final { scan-tree-dump-times {\.COND_MUL} 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times {\.VCOND} 1 "optimized" } } */
```