diff options
author | Thomas Koenig <tkoenig@gcc.gnu.org> | 2018-09-18 20:18:09 +0000 |
---|---|---|
committer | Thomas Koenig <tkoenig@gcc.gnu.org> | 2018-09-18 20:18:09 +0000 |
commit | 998511a6100212931d039e3a46403d2d878c8e5a (patch) | |
tree | 70e69614c308f81a76996bb4aac9fafffce58837 /gcc/fortran/frontend-passes.c | |
parent | 5c470e0f07563c3ab540c0a9f40b8a6ac4a29f07 (diff) | |
download | gcc-998511a6100212931d039e3a46403d2d878c8e5a.zip gcc-998511a6100212931d039e3a46403d2d878c8e5a.tar.gz gcc-998511a6100212931d039e3a46403d2d878c8e5a.tar.bz2 |
re PR fortran/29550 (Optimize -fexternal-blas calls for conjg())
2018-09-18 Thomas Koenig <tkoenig@gcc.gnu.org>
PR fortran/29550
* gfortran.h (gfc_expr): Add external_blas flag.
* frontend-passes.c (matrix_case): Add case A2TB2T.
(optimize_namespace): Handle flag_external_blas by
calling call_external_blas.
(get_array_inq_function): Add argument okind. If
it is nonzero, use it as the kind of argument
to be used.
(inline_limit_check): Remove m_case argument, add
limit argument instead. Remove assert about m_case.
Set the limit for inlining from the limit argument.
(matmul_lhs_realloc): Handle case A2TB2T.
(inline_matmul_assign): Handle inline limit for other cases with
two rank-two matrices. Remove no-op calls to inline_limit_check.
(call_external_blas): New function.
* trans-intrinsic.c (gfc_conv_intrinsic_funcall): Do not add
argument to external BLAS if external_blas is already set.
2018-09-18 Thomas Koenig <tkoenig@gcc.gnu.org>
PR fortran/29550
* gfortran.dg/inline_matmul_13.f90: Adjust count for
_gfortran_matmul.
* gfortran.dg/inline_matmul_16.f90: Likewise.
* gfortran.dg/promotion_2.f90: Add -fblas-matmul-limit=1. Scan
for dgemm instead of dgemm_. Add call to random_number to make
standard conforming.
* gfortran.dg/matmul_blas_1.f90: New test.
* gfortran.dg/matmul_bounds_14.f: New test.
* gfortran.dg/matmul_bounds_15.f: New test.
* gfortran.dg/matmul_bounds_16.f: New test.
* gfortran.dg/blas_gemm_routines.f: New test / additional file for
preceding tests.
From-SVN: r264412
Diffstat (limited to 'gcc/fortran/frontend-passes.c')
-rw-r--r-- | gcc/fortran/frontend-passes.c | 456 |
1 files changed, 439 insertions, 17 deletions
diff --git a/gcc/fortran/frontend-passes.c b/gcc/fortran/frontend-passes.c index 80a65fc..2a65b52 100644 --- a/gcc/fortran/frontend-passes.c +++ b/gcc/fortran/frontend-passes.c @@ -53,6 +53,7 @@ static gfc_code * create_do_loop (gfc_expr *, gfc_expr *, gfc_expr *, char *vname=NULL); static gfc_expr* check_conjg_transpose_variable (gfc_expr *, bool *, bool *); +static int call_external_blas (gfc_code **, int *, void *); static bool has_dimen_vector_ref (gfc_expr *); static int matmul_temp_args (gfc_code **, int *,void *data); static int index_interchange (gfc_code **, int*, void *); @@ -131,7 +132,7 @@ static int var_num = 1; /* What sort of matrix we are dealing with when inlining MATMUL. */ -enum matrix_case { none=0, A2B2, A2B1, A1B2, A2B2T, A2TB2 }; +enum matrix_case { none=0, A2B2, A2B1, A1B2, A2B2T, A2TB2, A2TB2T }; /* Keep track of the number of expressions we have inserted so far using create_var. */ @@ -1428,7 +1429,7 @@ optimize_namespace (gfc_namespace *ns) gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL); gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL); gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL); - if (flag_inline_matmul_limit != 0) + if (flag_inline_matmul_limit != 0 || flag_external_blas) { bool found; do @@ -1441,9 +1442,15 @@ optimize_namespace (gfc_namespace *ns) gfc_code_walker (&ns->code, matmul_temp_args, dummy_expr_callback, NULL); - gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback, - NULL); } + + if (flag_external_blas) + gfc_code_walker (&ns->code, call_external_blas, dummy_expr_callback, + NULL); + + if (flag_inline_matmul_limit != 0) + gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback, + NULL); } if (flag_frontend_loop_interchange) @@ -2938,7 +2945,7 @@ matmul_temp_args (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED, dim is zero-based. */ static gfc_expr * -get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim) +get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim, int okind = 0) { gfc_expr *fcn; gfc_expr *dim_arg, *kind; @@ -2964,8 +2971,12 @@ get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim) } dim_arg = gfc_get_int_expr (gfc_default_integer_kind, &e->where, dim); - kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where, - gfc_index_integer_kind); + if (okind != 0) + kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where, + okind); + else + kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where, + gfc_index_integer_kind); ec = gfc_copy_expr (e); @@ -3026,7 +3037,7 @@ get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2) removed by DCE. Only called for rank-two matrices A and B. */ static gfc_code * -inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case) +inline_limit_check (gfc_expr *a, gfc_expr *b, int limit) { gfc_expr *inline_limit; gfc_code *if_1, *if_2, *else_2; @@ -3034,14 +3045,11 @@ inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case) gfc_typespec ts; gfc_expr *cond; - gcc_assert (m_case == A2B2 || m_case == A2B2T || m_case == A2TB2); - /* Calculation is done in real to avoid integer overflow. */ inline_limit = gfc_get_constant_expr (BT_REAL, gfc_default_real_kind, &a->where); - mpfr_set_si (inline_limit->value.real, flag_inline_matmul_limit, - GFC_RND_MODE); + mpfr_set_si (inline_limit->value.real, limit, GFC_RND_MODE); mpfr_pow_ui (inline_limit->value.real, inline_limit->value.real, 3, GFC_RND_MODE); @@ -3235,6 +3243,22 @@ matmul_lhs_realloc (gfc_expr *c, gfc_expr *a, gfc_expr *b, get_array_inq_function (GFC_ISYM_SIZE, b, 2)); break; + case A2TB2T: + /* This can only happen for BLAS, we do not handle that case in + inline mamtul. */ + ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, a, 2); + ar->start[1] = get_array_inq_function (GFC_ISYM_SIZE, b, 1); + + ne1 = build_logical_expr (INTRINSIC_NE, + get_array_inq_function (GFC_ISYM_SIZE, c, 1), + get_array_inq_function (GFC_ISYM_SIZE, a, 2)); + ne2 = build_logical_expr (INTRINSIC_NE, + get_array_inq_function (GFC_ISYM_SIZE, c, 2), + get_array_inq_function (GFC_ISYM_SIZE, b, 1)); + + cond = build_logical_expr (INTRINSIC_OR, ne1, ne2); + break; + default: gcc_unreachable(); @@ -3946,9 +3970,11 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees, /* Take care of the inline flag. If the limit check evaluates to a constant, dead code elimination will eliminate the unneeded branch. */ - if (m_case == A2B2 && flag_inline_matmul_limit > 0) + if (flag_inline_matmul_limit > 0 && matrix_a->rank == 2 + && matrix_b->rank == 2) { - if_limit = inline_limit_check (matrix_a, matrix_b, m_case); + if_limit = inline_limit_check (matrix_a, matrix_b, + flag_inline_matmul_limit); /* Insert the original statement into the else branch. */ if_limit->block->block->next = co; @@ -4118,7 +4144,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees, switch (m_case) { case A2B2: - inline_limit_check (matrix_a, matrix_b, m_case); u1 = get_size_m1 (matrix_b, 2); u2 = get_size_m1 (matrix_a, 2); @@ -4151,7 +4176,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees, break; case A2B2T: - inline_limit_check (matrix_a, matrix_b, m_case); u1 = get_size_m1 (matrix_b, 1); u2 = get_size_m1 (matrix_a, 2); @@ -4184,7 +4208,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees, break; case A2TB2: - inline_limit_check (matrix_a, matrix_b, m_case); u1 = get_size_m1 (matrix_a, 2); u2 = get_size_m1 (matrix_b, 2); @@ -4311,6 +4334,405 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees, return 0; } +/* Change matmul function calls in the form of + + c = matmul(a,b) + + to the corresponding call to a BLAS routine, if applicable. */ + +static int +call_external_blas (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED, + void *data ATTRIBUTE_UNUSED) +{ + gfc_code *co, *co_next; + gfc_expr *expr1, *expr2; + gfc_expr *matrix_a, *matrix_b; + gfc_code *if_limit = NULL; + gfc_actual_arglist *a, *b; + bool conjg_a, conjg_b, transpose_a, transpose_b; + gfc_code *call; + const char *blas_name; + const char *transa, *transb; + gfc_expr *c1, *c2, *b1; + gfc_actual_arglist *actual, *next; + bt type; + int kind; + enum matrix_case m_case; + bool realloc_c; + gfc_code **next_code_point; + + /* Many of the tests for inline matmul also apply here. */ + + co = *c; + + if (co->op != EXEC_ASSIGN) + return 0; + + if (in_where || in_assoc_list) + return 0; + + /* The BLOCKS generated for the temporary variables and FORALL don't + mix. */ + if (forall_level > 0) + return 0; + + /* For now don't do anything in OpenMP workshare, it confuses + its translation, which expects only the allowed statements in there. */ + + if (in_omp_workshare) + return 0; + + expr1 = co->expr1; + expr2 = co->expr2; + if (expr2->expr_type != EXPR_FUNCTION + || expr2->value.function.isym == NULL + || expr2->value.function.isym->id != GFC_ISYM_MATMUL) + return 0; + + type = expr2->ts.type; + kind = expr2->ts.kind; + + /* Guard against recursion. */ + + if (expr2->external_blas) + return 0; + + if (type != expr1->ts.type || kind != expr1->ts.kind) + return 0; + + if (type == BT_REAL) + { + if (kind == 4) + blas_name = "sgemm"; + else if (kind == 8) + blas_name = "dgemm"; + else + return 0; + } + else if (type == BT_COMPLEX) + { + if (kind == 4) + blas_name = "cgemm"; + else if (kind == 8) + blas_name = "zgemm"; + else + return 0; + } + else + return 0; + + a = expr2->value.function.actual; + if (a->expr->rank != 2) + return 0; + + b = a->next; + if (b->expr->rank != 2) + return 0; + + matrix_a = check_conjg_transpose_variable (a->expr, &conjg_a, &transpose_a); + if (matrix_a == NULL) + return 0; + + if (transpose_a) + { + if (conjg_a) + transa = "C"; + else + transa = "T"; + } + else + transa = "N"; + + matrix_b = check_conjg_transpose_variable (b->expr, &conjg_b, &transpose_b); + if (matrix_b == NULL) + return 0; + + if (transpose_b) + { + if (conjg_b) + transb = "C"; + else + transb = "T"; + } + else + transb = "N"; + + if (transpose_a) + { + if (transpose_b) + m_case = A2TB2T; + else + m_case = A2TB2; + } + else + { + if (transpose_b) + m_case = A2B2T; + else + m_case = A2B2; + } + + current_code = c; + inserted_block = NULL; + changed_statement = NULL; + + expr2->external_blas = 1; + + /* We do not handle data dependencies yet. */ + if (gfc_check_dependency (expr1, matrix_a, true) + || gfc_check_dependency (expr1, matrix_b, true)) + return 0; + + /* Generate the if statement and hang it into the tree. */ + if_limit = inline_limit_check (matrix_a, matrix_b, flag_blas_matmul_limit); + co_next = co->next; + (*current_code) = if_limit; + co->next = NULL; + if_limit->block->next = co; + + call = XCNEW (gfc_code); + call->loc = co->loc; + + /* Bounds checking - a bit simpler than for inlining since we only + have to take care of two-dimensional arrays here. */ + + realloc_c = flag_realloc_lhs && gfc_is_reallocatable_lhs (expr1); + next_code_point = &(if_limit->block->block->next); + + if (gfc_option.rtcheck & GFC_RTCHECK_BOUNDS) + { + gfc_code *test; + // gfc_expr *a2, *b1, *c1, *c2, *a1, *b2; + gfc_expr *c1, *a1, *c2, *b2, *a2; + switch (m_case) + { + case A2B2: + b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1); + a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2); + test = runtime_error_ne (b1, a2, B_ERROR(1)); + *next_code_point = test; + next_code_point = &test->next; + + if (!realloc_c) + { + c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1); + a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1); + test = runtime_error_ne (c1, a1, C_ERROR(1)); + *next_code_point = test; + next_code_point = &test->next; + + c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2); + b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2); + test = runtime_error_ne (c2, b2, C_ERROR(2)); + *next_code_point = test; + next_code_point = &test->next; + } + break; + + case A2B2T: + + b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2); + a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2); + /* matrix_b is transposed, hence dimension 1 for the error message. */ + test = runtime_error_ne (b2, a2, B_ERROR(1)); + *next_code_point = test; + next_code_point = &test->next; + + if (!realloc_c) + { + c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1); + a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1); + test = runtime_error_ne (c1, a1, C_ERROR(1)); + *next_code_point = test; + next_code_point = &test->next; + + c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2); + b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1); + test = runtime_error_ne (c2, b1, C_ERROR(2)); + *next_code_point = test; + next_code_point = &test->next; + } + break; + + case A2TB2: + + b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1); + a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1); + test = runtime_error_ne (b1, a1, B_ERROR(1)); + *next_code_point = test; + next_code_point = &test->next; + + if (!realloc_c) + { + c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1); + a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2); + test = runtime_error_ne (c1, a2, C_ERROR(1)); + *next_code_point = test; + next_code_point = &test->next; + + c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2); + b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2); + test = runtime_error_ne (c2, b2, C_ERROR(2)); + *next_code_point = test; + next_code_point = &test->next; + } + break; + + case A2TB2T: + b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2); + a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1); + test = runtime_error_ne (b2, a1, B_ERROR(1)); + *next_code_point = test; + next_code_point = &test->next; + + if (!realloc_c) + { + c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1); + a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2); + test = runtime_error_ne (c1, a2, C_ERROR(1)); + *next_code_point = test; + next_code_point = &test->next; + + c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2); + b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1); + test = runtime_error_ne (c2, b1, C_ERROR(2)); + *next_code_point = test; + next_code_point = &test->next; + } + break; + + default: + gcc_unreachable (); + } + } + + /* Handle the reallocation, if needed. */ + + if (realloc_c) + { + gfc_code *lhs_alloc; + + lhs_alloc = matmul_lhs_realloc (expr1, matrix_a, matrix_b, m_case); + *next_code_point = lhs_alloc; + next_code_point = &lhs_alloc->next; + } + + *next_code_point = call; + if_limit->next = co_next; + + /* Set up the BLAS call. */ + + call->op = EXEC_CALL; + + gfc_get_sym_tree (blas_name, current_ns, &(call->symtree), true); + call->symtree->n.sym->attr.subroutine = 1; + call->symtree->n.sym->attr.procedure = 1; + call->symtree->n.sym->attr.flavor = FL_PROCEDURE; + call->resolved_sym = call->symtree->n.sym; + + /* Argument TRANSA. */ + next = gfc_get_actual_arglist (); + next->expr = gfc_get_character_expr (gfc_default_character_kind, &co->loc, + transa, 1); + + call->ext.actual = next; + + /* Argument TRANSB. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = gfc_get_character_expr (gfc_default_character_kind, &co->loc, + transb, 1); + actual->next = next; + + c1 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (a->expr), 1, + gfc_integer_4_kind); + c2 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (b->expr), 2, + gfc_integer_4_kind); + + b1 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (b->expr), 1, + gfc_integer_4_kind); + + /* Argument M. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = c1; + actual->next = next; + + /* Argument N. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = c2; + actual->next = next; + + /* Argument K. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = b1; + actual->next = next; + + /* Argument ALPHA - set to one. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = gfc_get_constant_expr (type, kind, &co->loc); + if (type == BT_REAL) + mpfr_set_ui (next->expr->value.real, 1, GFC_RND_MODE); + else + mpc_set_ui (next->expr->value.complex, 1, GFC_MPC_RND_MODE); + actual->next = next; + + /* Argument A. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = gfc_copy_expr (matrix_a); + actual->next = next; + + /* Argument LDA. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (matrix_a), + 1, gfc_integer_4_kind); + actual->next = next; + + /* Argument B. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = gfc_copy_expr (matrix_b); + actual->next = next; + + /* Argument LDB. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (matrix_b), + 1, gfc_integer_4_kind); + actual->next = next; + + /* Argument BETA - set to zero. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = gfc_get_constant_expr (type, kind, &co->loc); + if (type == BT_REAL) + mpfr_set_ui (next->expr->value.real, 0, GFC_RND_MODE); + else + mpc_set_ui (next->expr->value.complex, 0, GFC_MPC_RND_MODE); + actual->next = next; + + /* Argument C. */ + + actual = next; + next = gfc_get_actual_arglist (); + next->expr = gfc_copy_expr (expr1); + actual->next = next; + + /* Argument LDC. */ + actual = next; + next = gfc_get_actual_arglist (); + next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (expr1), + 1, gfc_integer_4_kind); + actual->next = next; + + return 0; +} + /* Code for index interchange for loops which are grouped together in DO CONCURRENT or FORALL statements. This is currently only applied if the |