aboutsummaryrefslogtreecommitdiff
path: root/gcc/fortran/frontend-passes.c
diff options
context:
space:
mode:
authorThomas Koenig <tkoenig@gcc.gnu.org>2018-09-18 20:18:09 +0000
committerThomas Koenig <tkoenig@gcc.gnu.org>2018-09-18 20:18:09 +0000
commit998511a6100212931d039e3a46403d2d878c8e5a (patch)
tree70e69614c308f81a76996bb4aac9fafffce58837 /gcc/fortran/frontend-passes.c
parent5c470e0f07563c3ab540c0a9f40b8a6ac4a29f07 (diff)
downloadgcc-998511a6100212931d039e3a46403d2d878c8e5a.zip
gcc-998511a6100212931d039e3a46403d2d878c8e5a.tar.gz
gcc-998511a6100212931d039e3a46403d2d878c8e5a.tar.bz2
re PR fortran/29550 (Optimize -fexternal-blas calls for conjg())
2018-09-18 Thomas Koenig <tkoenig@gcc.gnu.org> PR fortran/29550 * gfortran.h (gfc_expr): Add external_blas flag. * frontend-passes.c (matrix_case): Add case A2TB2T. (optimize_namespace): Handle flag_external_blas by calling call_external_blas. (get_array_inq_function): Add argument okind. If it is nonzero, use it as the kind of argument to be used. (inline_limit_check): Remove m_case argument, add limit argument instead. Remove assert about m_case. Set the limit for inlining from the limit argument. (matmul_lhs_realloc): Handle case A2TB2T. (inline_matmul_assign): Handle inline limit for other cases with two rank-two matrices. Remove no-op calls to inline_limit_check. (call_external_blas): New function. * trans-intrinsic.c (gfc_conv_intrinsic_funcall): Do not add argument to external BLAS if external_blas is already set. 2018-09-18 Thomas Koenig <tkoenig@gcc.gnu.org> PR fortran/29550 * gfortran.dg/inline_matmul_13.f90: Adjust count for _gfortran_matmul. * gfortran.dg/inline_matmul_16.f90: Likewise. * gfortran.dg/promotion_2.f90: Add -fblas-matmul-limit=1. Scan for dgemm instead of dgemm_. Add call to random_number to make standard conforming. * gfortran.dg/matmul_blas_1.f90: New test. * gfortran.dg/matmul_bounds_14.f: New test. * gfortran.dg/matmul_bounds_15.f: New test. * gfortran.dg/matmul_bounds_16.f: New test. * gfortran.dg/blas_gemm_routines.f: New test / additional file for preceding tests. From-SVN: r264412
Diffstat (limited to 'gcc/fortran/frontend-passes.c')
-rw-r--r--gcc/fortran/frontend-passes.c456
1 files changed, 439 insertions, 17 deletions
diff --git a/gcc/fortran/frontend-passes.c b/gcc/fortran/frontend-passes.c
index 80a65fc..2a65b52 100644
--- a/gcc/fortran/frontend-passes.c
+++ b/gcc/fortran/frontend-passes.c
@@ -53,6 +53,7 @@ static gfc_code * create_do_loop (gfc_expr *, gfc_expr *, gfc_expr *,
char *vname=NULL);
static gfc_expr* check_conjg_transpose_variable (gfc_expr *, bool *,
bool *);
+static int call_external_blas (gfc_code **, int *, void *);
static bool has_dimen_vector_ref (gfc_expr *);
static int matmul_temp_args (gfc_code **, int *,void *data);
static int index_interchange (gfc_code **, int*, void *);
@@ -131,7 +132,7 @@ static int var_num = 1;
/* What sort of matrix we are dealing with when inlining MATMUL. */
-enum matrix_case { none=0, A2B2, A2B1, A1B2, A2B2T, A2TB2 };
+enum matrix_case { none=0, A2B2, A2B1, A1B2, A2B2T, A2TB2, A2TB2T };
/* Keep track of the number of expressions we have inserted so far
using create_var. */
@@ -1428,7 +1429,7 @@ optimize_namespace (gfc_namespace *ns)
gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL);
gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL);
gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL);
- if (flag_inline_matmul_limit != 0)
+ if (flag_inline_matmul_limit != 0 || flag_external_blas)
{
bool found;
do
@@ -1441,9 +1442,15 @@ optimize_namespace (gfc_namespace *ns)
gfc_code_walker (&ns->code, matmul_temp_args, dummy_expr_callback,
NULL);
- gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
- NULL);
}
+
+ if (flag_external_blas)
+ gfc_code_walker (&ns->code, call_external_blas, dummy_expr_callback,
+ NULL);
+
+ if (flag_inline_matmul_limit != 0)
+ gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
+ NULL);
}
if (flag_frontend_loop_interchange)
@@ -2938,7 +2945,7 @@ matmul_temp_args (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
dim is zero-based. */
static gfc_expr *
-get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim)
+get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim, int okind = 0)
{
gfc_expr *fcn;
gfc_expr *dim_arg, *kind;
@@ -2964,8 +2971,12 @@ get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim)
}
dim_arg = gfc_get_int_expr (gfc_default_integer_kind, &e->where, dim);
- kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
- gfc_index_integer_kind);
+ if (okind != 0)
+ kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
+ okind);
+ else
+ kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
+ gfc_index_integer_kind);
ec = gfc_copy_expr (e);
@@ -3026,7 +3037,7 @@ get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2)
removed by DCE. Only called for rank-two matrices A and B. */
static gfc_code *
-inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case)
+inline_limit_check (gfc_expr *a, gfc_expr *b, int limit)
{
gfc_expr *inline_limit;
gfc_code *if_1, *if_2, *else_2;
@@ -3034,14 +3045,11 @@ inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case)
gfc_typespec ts;
gfc_expr *cond;
- gcc_assert (m_case == A2B2 || m_case == A2B2T || m_case == A2TB2);
-
/* Calculation is done in real to avoid integer overflow. */
inline_limit = gfc_get_constant_expr (BT_REAL, gfc_default_real_kind,
&a->where);
- mpfr_set_si (inline_limit->value.real, flag_inline_matmul_limit,
- GFC_RND_MODE);
+ mpfr_set_si (inline_limit->value.real, limit, GFC_RND_MODE);
mpfr_pow_ui (inline_limit->value.real, inline_limit->value.real, 3,
GFC_RND_MODE);
@@ -3235,6 +3243,22 @@ matmul_lhs_realloc (gfc_expr *c, gfc_expr *a, gfc_expr *b,
get_array_inq_function (GFC_ISYM_SIZE, b, 2));
break;
+ case A2TB2T:
+ /* This can only happen for BLAS, we do not handle that case in
+ inline mamtul. */
+ ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, a, 2);
+ ar->start[1] = get_array_inq_function (GFC_ISYM_SIZE, b, 1);
+
+ ne1 = build_logical_expr (INTRINSIC_NE,
+ get_array_inq_function (GFC_ISYM_SIZE, c, 1),
+ get_array_inq_function (GFC_ISYM_SIZE, a, 2));
+ ne2 = build_logical_expr (INTRINSIC_NE,
+ get_array_inq_function (GFC_ISYM_SIZE, c, 2),
+ get_array_inq_function (GFC_ISYM_SIZE, b, 1));
+
+ cond = build_logical_expr (INTRINSIC_OR, ne1, ne2);
+ break;
+
default:
gcc_unreachable();
@@ -3946,9 +3970,11 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
/* Take care of the inline flag. If the limit check evaluates to a
constant, dead code elimination will eliminate the unneeded branch. */
- if (m_case == A2B2 && flag_inline_matmul_limit > 0)
+ if (flag_inline_matmul_limit > 0 && matrix_a->rank == 2
+ && matrix_b->rank == 2)
{
- if_limit = inline_limit_check (matrix_a, matrix_b, m_case);
+ if_limit = inline_limit_check (matrix_a, matrix_b,
+ flag_inline_matmul_limit);
/* Insert the original statement into the else branch. */
if_limit->block->block->next = co;
@@ -4118,7 +4144,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
switch (m_case)
{
case A2B2:
- inline_limit_check (matrix_a, matrix_b, m_case);
u1 = get_size_m1 (matrix_b, 2);
u2 = get_size_m1 (matrix_a, 2);
@@ -4151,7 +4176,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
break;
case A2B2T:
- inline_limit_check (matrix_a, matrix_b, m_case);
u1 = get_size_m1 (matrix_b, 1);
u2 = get_size_m1 (matrix_a, 2);
@@ -4184,7 +4208,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
break;
case A2TB2:
- inline_limit_check (matrix_a, matrix_b, m_case);
u1 = get_size_m1 (matrix_a, 2);
u2 = get_size_m1 (matrix_b, 2);
@@ -4311,6 +4334,405 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
return 0;
}
+/* Change matmul function calls in the form of
+
+ c = matmul(a,b)
+
+ to the corresponding call to a BLAS routine, if applicable. */
+
+static int
+call_external_blas (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
+ void *data ATTRIBUTE_UNUSED)
+{
+ gfc_code *co, *co_next;
+ gfc_expr *expr1, *expr2;
+ gfc_expr *matrix_a, *matrix_b;
+ gfc_code *if_limit = NULL;
+ gfc_actual_arglist *a, *b;
+ bool conjg_a, conjg_b, transpose_a, transpose_b;
+ gfc_code *call;
+ const char *blas_name;
+ const char *transa, *transb;
+ gfc_expr *c1, *c2, *b1;
+ gfc_actual_arglist *actual, *next;
+ bt type;
+ int kind;
+ enum matrix_case m_case;
+ bool realloc_c;
+ gfc_code **next_code_point;
+
+ /* Many of the tests for inline matmul also apply here. */
+
+ co = *c;
+
+ if (co->op != EXEC_ASSIGN)
+ return 0;
+
+ if (in_where || in_assoc_list)
+ return 0;
+
+ /* The BLOCKS generated for the temporary variables and FORALL don't
+ mix. */
+ if (forall_level > 0)
+ return 0;
+
+ /* For now don't do anything in OpenMP workshare, it confuses
+ its translation, which expects only the allowed statements in there. */
+
+ if (in_omp_workshare)
+ return 0;
+
+ expr1 = co->expr1;
+ expr2 = co->expr2;
+ if (expr2->expr_type != EXPR_FUNCTION
+ || expr2->value.function.isym == NULL
+ || expr2->value.function.isym->id != GFC_ISYM_MATMUL)
+ return 0;
+
+ type = expr2->ts.type;
+ kind = expr2->ts.kind;
+
+ /* Guard against recursion. */
+
+ if (expr2->external_blas)
+ return 0;
+
+ if (type != expr1->ts.type || kind != expr1->ts.kind)
+ return 0;
+
+ if (type == BT_REAL)
+ {
+ if (kind == 4)
+ blas_name = "sgemm";
+ else if (kind == 8)
+ blas_name = "dgemm";
+ else
+ return 0;
+ }
+ else if (type == BT_COMPLEX)
+ {
+ if (kind == 4)
+ blas_name = "cgemm";
+ else if (kind == 8)
+ blas_name = "zgemm";
+ else
+ return 0;
+ }
+ else
+ return 0;
+
+ a = expr2->value.function.actual;
+ if (a->expr->rank != 2)
+ return 0;
+
+ b = a->next;
+ if (b->expr->rank != 2)
+ return 0;
+
+ matrix_a = check_conjg_transpose_variable (a->expr, &conjg_a, &transpose_a);
+ if (matrix_a == NULL)
+ return 0;
+
+ if (transpose_a)
+ {
+ if (conjg_a)
+ transa = "C";
+ else
+ transa = "T";
+ }
+ else
+ transa = "N";
+
+ matrix_b = check_conjg_transpose_variable (b->expr, &conjg_b, &transpose_b);
+ if (matrix_b == NULL)
+ return 0;
+
+ if (transpose_b)
+ {
+ if (conjg_b)
+ transb = "C";
+ else
+ transb = "T";
+ }
+ else
+ transb = "N";
+
+ if (transpose_a)
+ {
+ if (transpose_b)
+ m_case = A2TB2T;
+ else
+ m_case = A2TB2;
+ }
+ else
+ {
+ if (transpose_b)
+ m_case = A2B2T;
+ else
+ m_case = A2B2;
+ }
+
+ current_code = c;
+ inserted_block = NULL;
+ changed_statement = NULL;
+
+ expr2->external_blas = 1;
+
+ /* We do not handle data dependencies yet. */
+ if (gfc_check_dependency (expr1, matrix_a, true)
+ || gfc_check_dependency (expr1, matrix_b, true))
+ return 0;
+
+ /* Generate the if statement and hang it into the tree. */
+ if_limit = inline_limit_check (matrix_a, matrix_b, flag_blas_matmul_limit);
+ co_next = co->next;
+ (*current_code) = if_limit;
+ co->next = NULL;
+ if_limit->block->next = co;
+
+ call = XCNEW (gfc_code);
+ call->loc = co->loc;
+
+ /* Bounds checking - a bit simpler than for inlining since we only
+ have to take care of two-dimensional arrays here. */
+
+ realloc_c = flag_realloc_lhs && gfc_is_reallocatable_lhs (expr1);
+ next_code_point = &(if_limit->block->block->next);
+
+ if (gfc_option.rtcheck & GFC_RTCHECK_BOUNDS)
+ {
+ gfc_code *test;
+ // gfc_expr *a2, *b1, *c1, *c2, *a1, *b2;
+ gfc_expr *c1, *a1, *c2, *b2, *a2;
+ switch (m_case)
+ {
+ case A2B2:
+ b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+ a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+ test = runtime_error_ne (b1, a2, B_ERROR(1));
+ *next_code_point = test;
+ next_code_point = &test->next;
+
+ if (!realloc_c)
+ {
+ c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+ a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+ test = runtime_error_ne (c1, a1, C_ERROR(1));
+ *next_code_point = test;
+ next_code_point = &test->next;
+
+ c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
+ b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+ test = runtime_error_ne (c2, b2, C_ERROR(2));
+ *next_code_point = test;
+ next_code_point = &test->next;
+ }
+ break;
+
+ case A2B2T:
+
+ b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+ a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+ /* matrix_b is transposed, hence dimension 1 for the error message. */
+ test = runtime_error_ne (b2, a2, B_ERROR(1));
+ *next_code_point = test;
+ next_code_point = &test->next;
+
+ if (!realloc_c)
+ {
+ c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+ a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+ test = runtime_error_ne (c1, a1, C_ERROR(1));
+ *next_code_point = test;
+ next_code_point = &test->next;
+
+ c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
+ b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+ test = runtime_error_ne (c2, b1, C_ERROR(2));
+ *next_code_point = test;
+ next_code_point = &test->next;
+ }
+ break;
+
+ case A2TB2:
+
+ b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+ a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+ test = runtime_error_ne (b1, a1, B_ERROR(1));
+ *next_code_point = test;
+ next_code_point = &test->next;
+
+ if (!realloc_c)
+ {
+ c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+ a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+ test = runtime_error_ne (c1, a2, C_ERROR(1));
+ *next_code_point = test;
+ next_code_point = &test->next;
+
+ c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
+ b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+ test = runtime_error_ne (c2, b2, C_ERROR(2));
+ *next_code_point = test;
+ next_code_point = &test->next;
+ }
+ break;
+
+ case A2TB2T:
+ b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+ a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+ test = runtime_error_ne (b2, a1, B_ERROR(1));
+ *next_code_point = test;
+ next_code_point = &test->next;
+
+ if (!realloc_c)
+ {
+ c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+ a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+ test = runtime_error_ne (c1, a2, C_ERROR(1));
+ *next_code_point = test;
+ next_code_point = &test->next;
+
+ c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
+ b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+ test = runtime_error_ne (c2, b1, C_ERROR(2));
+ *next_code_point = test;
+ next_code_point = &test->next;
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+
+ /* Handle the reallocation, if needed. */
+
+ if (realloc_c)
+ {
+ gfc_code *lhs_alloc;
+
+ lhs_alloc = matmul_lhs_realloc (expr1, matrix_a, matrix_b, m_case);
+ *next_code_point = lhs_alloc;
+ next_code_point = &lhs_alloc->next;
+ }
+
+ *next_code_point = call;
+ if_limit->next = co_next;
+
+ /* Set up the BLAS call. */
+
+ call->op = EXEC_CALL;
+
+ gfc_get_sym_tree (blas_name, current_ns, &(call->symtree), true);
+ call->symtree->n.sym->attr.subroutine = 1;
+ call->symtree->n.sym->attr.procedure = 1;
+ call->symtree->n.sym->attr.flavor = FL_PROCEDURE;
+ call->resolved_sym = call->symtree->n.sym;
+
+ /* Argument TRANSA. */
+ next = gfc_get_actual_arglist ();
+ next->expr = gfc_get_character_expr (gfc_default_character_kind, &co->loc,
+ transa, 1);
+
+ call->ext.actual = next;
+
+ /* Argument TRANSB. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = gfc_get_character_expr (gfc_default_character_kind, &co->loc,
+ transb, 1);
+ actual->next = next;
+
+ c1 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (a->expr), 1,
+ gfc_integer_4_kind);
+ c2 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (b->expr), 2,
+ gfc_integer_4_kind);
+
+ b1 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (b->expr), 1,
+ gfc_integer_4_kind);
+
+ /* Argument M. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = c1;
+ actual->next = next;
+
+ /* Argument N. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = c2;
+ actual->next = next;
+
+ /* Argument K. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = b1;
+ actual->next = next;
+
+ /* Argument ALPHA - set to one. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = gfc_get_constant_expr (type, kind, &co->loc);
+ if (type == BT_REAL)
+ mpfr_set_ui (next->expr->value.real, 1, GFC_RND_MODE);
+ else
+ mpc_set_ui (next->expr->value.complex, 1, GFC_MPC_RND_MODE);
+ actual->next = next;
+
+ /* Argument A. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = gfc_copy_expr (matrix_a);
+ actual->next = next;
+
+ /* Argument LDA. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (matrix_a),
+ 1, gfc_integer_4_kind);
+ actual->next = next;
+
+ /* Argument B. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = gfc_copy_expr (matrix_b);
+ actual->next = next;
+
+ /* Argument LDB. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (matrix_b),
+ 1, gfc_integer_4_kind);
+ actual->next = next;
+
+ /* Argument BETA - set to zero. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = gfc_get_constant_expr (type, kind, &co->loc);
+ if (type == BT_REAL)
+ mpfr_set_ui (next->expr->value.real, 0, GFC_RND_MODE);
+ else
+ mpc_set_ui (next->expr->value.complex, 0, GFC_MPC_RND_MODE);
+ actual->next = next;
+
+ /* Argument C. */
+
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = gfc_copy_expr (expr1);
+ actual->next = next;
+
+ /* Argument LDC. */
+ actual = next;
+ next = gfc_get_actual_arglist ();
+ next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (expr1),
+ 1, gfc_integer_4_kind);
+ actual->next = next;
+
+ return 0;
+}
+
/* Code for index interchange for loops which are grouped together in DO
CONCURRENT or FORALL statements. This is currently only applied if the