From 77e6870d7fc85a4139f7f27ee3287b103d374ab2 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Thu, 1 Dec 2016 06:06:04 +0000 Subject: crossconfig.m4 (*-linux*): Add link-check for memalign. 2016-12-01 Bernd Edlinger * crossconfig.m4 (*-linux*): Add link-check for memalign. * configure: Regenerated. From-SVN: r243095 --- libstdc++-v3/ChangeLog | 7 ++++++- libstdc++-v3/configure | 13 +++++++++++++ libstdc++-v3/crossconfig.m4 | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index cd4f5ae..b5f94c0 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,4 +1,9 @@ -2016-11-31 Tim Shen +2016-12-01 Bernd Edlinger + + * crossconfig.m4 (*-linux*): Add link-check for memalign. + * configure: Regenerated. + +2016-12-01 Tim Shen PR libstdc++/71500 * include/bits/regex.h (basic_regex::basic_regex): Use ECMAScript diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure index f32197e..1f72e3f 100755 --- a/libstdc++-v3/configure +++ b/libstdc++-v3/configure @@ -59826,6 +59826,19 @@ _ACEOF fi done + for ac_func in aligned_alloc posix_memalign memalign _aligned_malloc +do : + as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` +ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" +eval as_val=\$$as_ac_var + if test "x$as_val" = x""yes; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1 +_ACEOF + +fi +done + diff --git a/libstdc++-v3/crossconfig.m4 b/libstdc++-v3/crossconfig.m4 index 6abc84f..4eaf208 100644 --- a/libstdc++-v3/crossconfig.m4 +++ b/libstdc++-v3/crossconfig.m4 @@ -157,6 +157,7 @@ case "${host}" in AC_DEFINE(_GLIBCXX_USE_RANDOM_TR1) GCC_CHECK_TLS AC_CHECK_FUNCS(__cxa_thread_atexit_impl) + AC_CHECK_FUNCS(aligned_alloc posix_memalign memalign _aligned_malloc) AM_ICONV ;; *-mingw32*) -- cgit v1.1 From f73ee211c30c292316ffa6f55eca3531c67a4de7 Mon Sep 17 00:00:00 2001 From: Alan Modra Date: Thu, 1 Dec 2016 17:44:04 +1030 Subject: [RS6000] fix rtl-checking internal compiler error * gcc/config/rs6000/rs6000.c (insn_is_swappable_p): Properly look inside UNSPEC_VSX_XXSPLTW vec. From-SVN: r243097 --- gcc/ChangeLog | 5 +++++ gcc/config/rs6000/rs6000.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e2f7b25..e36f9d8 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 Alan Modra + + * gcc/config/rs6000/rs6000.c (insn_is_swappable_p): Properly + look inside UNSPEC_VSX_XXSPLTW vec. + 2016-12-01 Segher Boessenkool PR rtl-optimization/78607 diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 9fe98b7..7f307b1 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -40675,7 +40675,7 @@ insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn, if (GET_CODE (use_body) != SET || GET_CODE (SET_SRC (use_body)) != UNSPEC || XINT (SET_SRC (use_body), 1) != UNSPEC_VSX_XXSPLTW - || XEXP (XEXP (SET_SRC (use_body), 0), 1) != const0_rtx) + || XVECEXP (SET_SRC (use_body), 0, 1) != const0_rtx) return 0; } } -- cgit v1.1 From d9b2d86c74a2153b07fb9d63baf220d385043a9d Mon Sep 17 00:00:00 2001 From: Ville Voutilainen Date: Thu, 1 Dec 2016 09:14:19 +0200 Subject: The convertible_to traits need to use a variadic catch-all for the false-cases. The convertible_to traits need to use a variadic catch-all for the false-cases. * include/std/istream (__is_convertible_to_basic_istream): Change the parameter of the false-case of __check to a variadic. * include/std/ostream (__is_convertible_to_basic_ostream): Likewise. From-SVN: r243098 --- libstdc++-v3/ChangeLog | 9 +++++++++ libstdc++-v3/include/std/istream | 2 +- libstdc++-v3/include/std/ostream | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index b5f94c0..cc0b3ae 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,12 @@ +2016-12-01 Ville Voutilainen + + The convertible_to traits need to use a variadic catch-all for the + false-cases. + * include/std/istream (__is_convertible_to_basic_istream): + Change the parameter of the false-case of __check to a variadic. + * include/std/ostream (__is_convertible_to_basic_ostream): + Likewise. + 2016-12-01 Bernd Edlinger * crossconfig.m4 (*-linux*): Add link-check for memalign. diff --git a/libstdc++-v3/include/std/istream b/libstdc++-v3/include/std/istream index 319e226..1d77d30 100644 --- a/libstdc++-v3/include/std/istream +++ b/libstdc++-v3/include/std/istream @@ -915,7 +915,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template static basic_istream<_Ch, _Up>& __check(basic_istream<_Ch, _Up>*); - static void __check(void*); + static void __check(...); public: using istream_type = decltype(__check(declval::type*>())); diff --git a/libstdc++-v3/include/std/ostream b/libstdc++-v3/include/std/ostream index 70fd10b..9dea778 100644 --- a/libstdc++-v3/include/std/ostream +++ b/libstdc++-v3/include/std/ostream @@ -619,7 +619,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template static basic_ostream<_Ch, _Up>& __check(basic_ostream<_Ch, _Up>*); - static void __check(void*); + static void __check(...); public: using ostream_type = decltype(__check(declval::type*>())); -- cgit v1.1 From 43d0b501eec49d6d4092fe0e5299aedf1d743124 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 1 Dec 2016 08:56:49 +0100 Subject: re PR target/78614 (ICE error: invalid rtl sharing found in the insn (verify_rtx_sharing) gcc/emit-rtl.c:2743) PR target/78614 * config/rs6000/rs6000.c (rs6000_frame_related): Call set_used_flags (pat) before any simplifications. Clear used flag on PARALLEL copy. Don't guard add_reg_note call. Call copy_rtx_if_shared on pat before storing it into REG_FRAME_RELATED_EXPR. From-SVN: r243099 --- gcc/ChangeLog | 9 +++++++++ gcc/config/rs6000/rs6000.c | 5 +++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e36f9d8..b3cc6305 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2016-12-01 Jakub Jelinek + + PR target/78614 + * config/rs6000/rs6000.c (rs6000_frame_related): Call + set_used_flags (pat) before any simplifications. Clear used flag on + PARALLEL copy. Don't guard add_reg_note call. Call + copy_rtx_if_shared on pat before storing it into + REG_FRAME_RELATED_EXPR. + 2016-12-01 Alan Modra * gcc/config/rs6000/rs6000.c (insn_is_swappable_p): Properly diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 7f307b1..e572620 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -27174,6 +27174,7 @@ rs6000_frame_related (rtx_insn *insn, rtx reg, HOST_WIDE_INT val, Call simplify_replace_rtx on the SETs rather than the whole insn so as to leave the other stuff alone (for example USE of r12). */ + set_used_flags (pat); if (GET_CODE (pat) == SET) { if (repl) @@ -27185,6 +27186,7 @@ rs6000_frame_related (rtx_insn *insn, rtx reg, HOST_WIDE_INT val, { pat = shallow_copy_rtx (pat); XVEC (pat, 0) = shallow_copy_rtvec (XVEC (pat, 0)); + RTX_FLAG (pat, used) = 0; for (int i = 0; i < XVECLEN (pat, 0); i++) if (GET_CODE (XVECEXP (pat, 0, i)) == SET) @@ -27207,8 +27209,7 @@ rs6000_frame_related (rtx_insn *insn, rtx reg, HOST_WIDE_INT val, gcc_unreachable (); RTX_FRAME_RELATED_P (insn) = 1; - if (repl || reg2) - add_reg_note (insn, REG_FRAME_RELATED_EXPR, pat); + add_reg_note (insn, REG_FRAME_RELATED_EXPR, copy_rtx_if_shared (pat)); return insn; } -- cgit v1.1 From d26b3eb7658b48e6dadec752755f864652f19591 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 1 Dec 2016 10:24:55 +0100 Subject: re PR debug/78587 (dwarf2out.c:1517:45: runtime error: negation of -9223372036854775808 cannot be represented in type 'long int [4]'; cast to an unsigned type to negate this value to itself) PR debug/78587 * dwarf2out.c (loc_descr_plus_const): For negative offset use uint_loc_descriptor instead of int_loc_descriptor and perform negation in unsigned HOST_WIDE_INT type. (scompare_loc_descriptor): Shift UINTVAL left instead of INTVAL. * gcc.dg/debug/pr78587.c: New test. From-SVN: r243100 --- gcc/ChangeLog | 6 ++++++ gcc/dwarf2out.c | 7 ++++--- gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/gcc.dg/debug/pr78587.c | 23 +++++++++++++++++++++++ 4 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/debug/pr78587.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b3cc6305..ef945b1 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,11 @@ 2016-12-01 Jakub Jelinek + PR debug/78587 + * dwarf2out.c (loc_descr_plus_const): For negative offset use + uint_loc_descriptor instead of int_loc_descriptor and perform negation + in unsigned HOST_WIDE_INT type. + (scompare_loc_descriptor): Shift UINTVAL left instead of INTVAL. + PR target/78614 * config/rs6000/rs6000.c (rs6000_frame_related): Call set_used_flags (pat) before any simplifications. Clear used flag on diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c index 66a4919..bc328ab 100644 --- a/gcc/dwarf2out.c +++ b/gcc/dwarf2out.c @@ -1514,7 +1514,8 @@ loc_descr_plus_const (dw_loc_descr_ref *list_head, HOST_WIDE_INT offset) else { - loc->dw_loc_next = int_loc_descriptor (-offset); + loc->dw_loc_next + = uint_loc_descriptor (-(unsigned HOST_WIDE_INT) offset); add_loc_descr (&loc->dw_loc_next, new_loc_descr (DW_OP_minus, 0, 0)); } } @@ -13837,7 +13838,7 @@ scompare_loc_descriptor (enum dwarf_location_atom op, rtx rtl, if (CONST_INT_P (XEXP (rtl, 1)) && GET_MODE_BITSIZE (op_mode) < HOST_BITS_PER_WIDE_INT && (size_of_int_loc_descriptor (shift) + 1 - + size_of_int_loc_descriptor (INTVAL (XEXP (rtl, 1)) << shift) + + size_of_int_loc_descriptor (UINTVAL (XEXP (rtl, 1)) << shift) >= size_of_int_loc_descriptor (GET_MODE_MASK (op_mode)) + 1 + size_of_int_loc_descriptor (INTVAL (XEXP (rtl, 1)) & GET_MODE_MASK (op_mode)))) @@ -13852,7 +13853,7 @@ scompare_loc_descriptor (enum dwarf_location_atom op, rtx rtl, add_loc_descr (&op0, int_loc_descriptor (shift)); add_loc_descr (&op0, new_loc_descr (DW_OP_shl, 0, 0)); if (CONST_INT_P (XEXP (rtl, 1))) - op1 = int_loc_descriptor (INTVAL (XEXP (rtl, 1)) << shift); + op1 = int_loc_descriptor (UINTVAL (XEXP (rtl, 1)) << shift); else { add_loc_descr (&op1, int_loc_descriptor (shift)); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index bf4bd8a..2d1c182 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 Jakub Jelinek + + PR debug/78587 + * gcc.dg/debug/pr78587.c: New test. + 2016-12-01 Segher Boessenkool PR rtl-optimization/78607 diff --git a/gcc/testsuite/gcc.dg/debug/pr78587.c b/gcc/testsuite/gcc.dg/debug/pr78587.c new file mode 100644 index 0000000..b368a2a --- /dev/null +++ b/gcc/testsuite/gcc.dg/debug/pr78587.c @@ -0,0 +1,23 @@ +/* PR debug/78587 */ +/* { dg-do compile } */ +/* { dg-additional-options "-w" } */ + +extern void bar (void); + +void +foo (long long x) +{ + x ^= 9223372036854775808ULL; + bar (); +} + +struct S { int w[4]; } a[1], b; + +void +baz () +{ + int e = (int) baz; + if (e <= -80) + e = 0; + b = a[e]; +} -- cgit v1.1 From ccbf6355186e78ac85245962be0d33c0c4c28ac6 Mon Sep 17 00:00:00 2001 From: Andre Vehreschild Date: Thu, 1 Dec 2016 10:53:25 +0100 Subject: coarray_lib_alloc_4.f90: Fix for 32-bits. gcc/testsuite/ChangeLog: 2016-12-01 Andre Vehreschild * gfortran.dg/coarray_lib_alloc_4.f90: Fix for 32-bits. From-SVN: r243101 --- gcc/testsuite/ChangeLog | 4 ++++ gcc/testsuite/gfortran.dg/coarray_lib_alloc_4.f90 | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 2d1c182..98a14c6 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2016-12-01 Andre Vehreschild + + * gfortran.dg/coarray_lib_alloc_4.f90: Fix for 32-bits. + 2016-12-01 Jakub Jelinek PR debug/78587 diff --git a/gcc/testsuite/gfortran.dg/coarray_lib_alloc_4.f90 b/gcc/testsuite/gfortran.dg/coarray_lib_alloc_4.f90 index aea9fbf..7b72707 100644 --- a/gcc/testsuite/gfortran.dg/coarray_lib_alloc_4.f90 +++ b/gcc/testsuite/gfortran.dg/coarray_lib_alloc_4.f90 @@ -38,7 +38,7 @@ program test_caf_alloc end ! { dg-final { scan-tree-dump-times "_gfortran_caf_is_present \\(xx\\.token, 2 - \\(integer\\(kind=4\\)\\) xx\\.dim\\\[0\\\]\\.lbound, &caf_ref\\.\[0-9\]+\\)|_gfortran_caf_is_present \\(xx\\.token, 2 - xx\\.dim\\\[0\\\]\\.lbound, &caf_ref\\.\[0-9\]+\\)" 10 "original" } } -! { dg-final { scan-tree-dump-times "_gfortran_caf_register \\(72, 1, &xx\\.token, \\(void \\*\\) &xx, 0B, 0B, 0\\)" 1 "original" } } +! { dg-final { scan-tree-dump-times "_gfortran_caf_register \\(\[0-9\]+, 1, &xx\\.token, \\(void \\*\\) &xx, 0B, 0B, 0\\)" 1 "original" } } ! { dg-final { scan-tree-dump-times "_gfortran_caf_register \\(\[0-9\]+, 7" 2 "original" } } ! { dg-final { scan-tree-dump-times "_gfortran_caf_register \\(\[0-9\]+, 8" 2 "original" } } ! { dg-final { scan-tree-dump-times "_gfortran_caf_deregister \\(&xx\\.token, 0, 0B, 0B, 0\\)" 1 "original" } } -- cgit v1.1 From 1636ebdc8a21300aa5aa6b8cfd423d397394c7a5 Mon Sep 17 00:00:00 2001 From: Georg-Johann Lay Date: Thu, 1 Dec 2016 10:09:56 +0000 Subject: avr.c (avr_print_operand): Use SYMBOL_REF_P if possible. gcc/ * config/avr/avr.c (avr_print_operand): Use SYMBOL_REF_P if possible. (avr_handle_addr_attribute, avr_asm_output_aligned_decl_common) (avr_asm_asm_output_aligned_bss, avr_addr_space_convert): Dito. From-SVN: r243104 --- gcc/ChangeLog | 6 ++++++ gcc/config/avr/avr.c | 10 +++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index ef945b1..2683757 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2016-12-01 Georg-Johann Lay + + * config/avr/avr.c (avr_print_operand): Use SYMBOL_REF_P if possible. + (avr_handle_addr_attribute, avr_asm_output_aligned_decl_common) + (avr_asm_asm_output_aligned_bss, avr_addr_space_convert): Dito. + 2016-12-01 Jakub Jelinek PR debug/78587 diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c index d0c54c2..db3c55f 100644 --- a/gcc/config/avr/avr.c +++ b/gcc/config/avr/avr.c @@ -2726,7 +2726,7 @@ avr_print_operand (FILE *file, rtx x, int code) } else if (code == 'i') { - if (GET_CODE (x) == SYMBOL_REF && (SYMBOL_REF_FLAGS (x) & SYMBOL_FLAG_IO)) + if (SYMBOL_REF_P (x) && (SYMBOL_REF_FLAGS (x) & SYMBOL_FLAG_IO)) avr_print_operand_address (file, VOIDmode, plus_constant (HImode, x, -avr_arch->sfr_offset)); else @@ -9585,7 +9585,7 @@ avr_handle_addr_attribute (tree *node, tree name, tree args, rtx avr_eval_addr_attrib (rtx x) { - if (GET_CODE (x) == SYMBOL_REF + if (SYMBOL_REF_P (x) && (SYMBOL_REF_FLAGS (x) & SYMBOL_FLAG_ADDRESS)) { tree decl = SYMBOL_REF_DECL (x); @@ -9896,7 +9896,7 @@ avr_asm_output_aligned_decl_common (FILE * stream, rtx symbol; if (mem != NULL_RTX && MEM_P (mem) - && GET_CODE ((symbol = XEXP (mem, 0))) == SYMBOL_REF + && SYMBOL_REF_P ((symbol = XEXP (mem, 0))) && (SYMBOL_REF_FLAGS (symbol) & (SYMBOL_FLAG_IO | SYMBOL_FLAG_ADDRESS))) { @@ -9941,7 +9941,7 @@ avr_asm_asm_output_aligned_bss (FILE *file, tree decl, const char *name, rtx symbol; if (mem != NULL_RTX && MEM_P (mem) - && GET_CODE ((symbol = XEXP (mem, 0))) == SYMBOL_REF + && SYMBOL_REF_P ((symbol = XEXP (mem, 0))) && (SYMBOL_REF_FLAGS (symbol) & (SYMBOL_FLAG_IO | SYMBOL_FLAG_ADDRESS))) { if (!(SYMBOL_REF_FLAGS (symbol) & SYMBOL_FLAG_ADDRESS)) @@ -12715,7 +12715,7 @@ avr_addr_space_convert (rtx src, tree type_from, tree type_to) but are located in flash. In that case we patch the incoming address space. */ - if (SYMBOL_REF == GET_CODE (sym) + if (SYMBOL_REF_P (sym) && ADDR_SPACE_FLASH == AVR_SYMBOL_GET_ADDR_SPACE (sym)) { as_from = ADDR_SPACE_FLASH; -- cgit v1.1 From b0da97091dbeca1e2653208febaa747f4e5a85bb Mon Sep 17 00:00:00 2001 From: Georg-Johann Lay Date: Thu, 1 Dec 2016 10:21:31 +0000 Subject: tiny-memx.c: Only perform if target avr_tiny. gcc/testsuite/ * gcc.target/avr/tiny-memx.c: Only perform if target avr_tiny. * gcc.target/avr/tiny-caller-save.c: Dito. From-SVN: r243105 --- gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/gcc.target/avr/tiny-caller-save.c | 2 +- gcc/testsuite/gcc.target/avr/tiny-memx.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 98a14c6..ab55b43 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 Georg-Johann Lay + + * gcc.target/avr/tiny-memx.c: Only perform if target avr_tiny. + * gcc.target/avr/tiny-caller-save.c: Dito. + 2016-12-01 Andre Vehreschild * gfortran.dg/coarray_lib_alloc_4.f90: Fix for 32-bits. diff --git a/gcc/testsuite/gcc.target/avr/tiny-caller-save.c b/gcc/testsuite/gcc.target/avr/tiny-caller-save.c index 63fad3a..ff35161 100644 --- a/gcc/testsuite/gcc.target/avr/tiny-caller-save.c +++ b/gcc/testsuite/gcc.target/avr/tiny-caller-save.c @@ -1,4 +1,4 @@ -/* { dg-do compile } */ +/* { dg-do compile { target avr_tiny } } */ /* { dg-options "-mmcu=avrtiny -gdwarf -Os" } */ /* This is a stripped down piece of libgcc2.c that triggerd an ICE for avr with diff --git a/gcc/testsuite/gcc.target/avr/tiny-memx.c b/gcc/testsuite/gcc.target/avr/tiny-memx.c index cdda86b..f691dcf 100644 --- a/gcc/testsuite/gcc.target/avr/tiny-memx.c +++ b/gcc/testsuite/gcc.target/avr/tiny-memx.c @@ -1,4 +1,4 @@ -/* { dg-do compile } */ +/* { dg-do compile { target avr_tiny } } */ /* { dg-options "-mmcu=avrtiny" } */ const __memx char ascmonth[] = "Jan"; /* { dg-error "not supported" } */ -- cgit v1.1 From a1fc386ac283d309eb6af8cf7b009d528bc52a9a Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 1 Dec 2016 12:15:44 +0000 Subject: tree-ssa-alias.c (indirect_refs_may_alias_p): Do not treat arrays with same type as objects that cannot overlap. 2016-12-01 Richard Biener * tree-ssa-alias.c (indirect_refs_may_alias_p): Do not treat arrays with same type as objects that cannot overlap. * gcc.dg/torture/alias-2.c: New testcase. From-SVN: r243106 --- gcc/ChangeLog | 5 +++++ gcc/testsuite/ChangeLog | 4 ++++ gcc/testsuite/gcc.dg/torture/alias-2.c | 17 +++++++++++++++++ 3 files changed, 26 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/torture/alias-2.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 2683757..19cb0ce 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 Richard Biener + + * tree-ssa-alias.c (indirect_refs_may_alias_p): Do not + treat arrays with same type as objects that cannot overlap. + 2016-12-01 Georg-Johann Lay * config/avr/avr.c (avr_print_operand): Use SYMBOL_REF_P if possible. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index ab55b43..447d9fb 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2016-12-01 Richard Biener + + * gcc.dg/torture/alias-2.c: New testcase. + 2016-12-01 Georg-Johann Lay * gcc.target/avr/tiny-memx.c: Only perform if target avr_tiny. diff --git a/gcc/testsuite/gcc.dg/torture/alias-2.c b/gcc/testsuite/gcc.dg/torture/alias-2.c new file mode 100644 index 0000000..329d46a --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/alias-2.c @@ -0,0 +1,17 @@ +/* { dg-do run } */ + +/* We do not want to treat int[3] as an object that cannot overlap + itself but treat it as arbitrary sub-array of a larger array object. */ +int ar1(int (*p)[3], int (*q)[3]) +{ + (*p)[0] = 1; + (*q)[1] = 2; + return (*p)[0]; +} +int main() +{ + int a[4]; + if (ar1 ((int (*)[3])&a[1], (int (*)[3])&a[0]) != 2) + __builtin_abort (); + return 0; +} -- cgit v1.1 From a41e62e743af77da51413bd6f865f4c270f11674 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 1 Dec 2016 12:22:32 +0000 Subject: tree-ssa-alias.c (indirect_refs_may_alias_p): Do not treat arrays with same type as objects that cannot overlap. 2016-12-01 Richard Biener * tree-ssa-alias.c (indirect_refs_may_alias_p): Do not treat arrays with same type as objects that cannot overlap. * gcc.dg/torture/alias-2.c: New testcase. From-SVN: r243107 --- gcc/tree-ssa-alias.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c index ebae6cf..10f1677 100644 --- a/gcc/tree-ssa-alias.c +++ b/gcc/tree-ssa-alias.c @@ -1355,7 +1355,10 @@ indirect_refs_may_alias_p (tree ref1 ATTRIBUTE_UNUSED, tree base1, && same_type_for_tbaa (TREE_TYPE (base1), TREE_TYPE (ptrtype1)) == 1 && same_type_for_tbaa (TREE_TYPE (base2), TREE_TYPE (ptrtype2)) == 1 && same_type_for_tbaa (TREE_TYPE (ptrtype1), - TREE_TYPE (ptrtype2)) == 1) + TREE_TYPE (ptrtype2)) == 1 + /* But avoid treating arrays as "objects", instead assume they + can overlap by an exact multiple of their element size. */ + && TREE_CODE (TREE_TYPE (ptrtype1)) != ARRAY_TYPE) return ranges_overlap_p (offset1, max_size1, offset2, max_size2); /* Do type-based disambiguation. */ -- cgit v1.1 From 825fba526ce816bd26887457e2f0f4ca05b1ccdd Mon Sep 17 00:00:00 2001 From: Matthias Klose Date: Thu, 1 Dec 2016 12:31:49 +0000 Subject: configure.ac: Don't use pkg-config to check for bdw-gc. * configure.ac: Don't use pkg-config to check for bdw-gc. * configure: Regenerate. config/ * pkg.m4: Remove. libobjc/ * configure.ac: Don't use pkg-config to check for bdw-gc. * configure: Regenerate. gcc/ * doc/install.texi: Don't use pkg-config to check for bdw-gc. From-SVN: r243108 --- ChangeLog | 5 + config/ChangeLog | 4 + config/pkg.m4 | 825 --------------------------------------------------- configure | 144 +-------- configure.ac | 7 +- gcc/ChangeLog | 5 + gcc/doc/install.texi | 3 +- libobjc/ChangeLog | 5 + libobjc/configure | 152 +--------- libobjc/configure.ac | 11 +- 10 files changed, 32 insertions(+), 1129 deletions(-) delete mode 100644 config/pkg.m4 diff --git a/ChangeLog b/ChangeLog index 7876d60..a3320f1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 Matthias Klose + + * configure.ac: Don't use pkg-config to check for bdw-gc. + * configure: Regenerate. + 2016-11-30 Matthias Klose * Makefile.def: Remove reference to boehm-gc target module. diff --git a/config/ChangeLog b/config/ChangeLog index ed59787..8dcb483 100644 --- a/config/ChangeLog +++ b/config/ChangeLog @@ -1,3 +1,7 @@ +2016-12-01 Matthias Klose + + * pkg.m4: Remove. + 2016-11-30 Matthias Klose * pkg.m4: New file. diff --git a/config/pkg.m4 b/config/pkg.m4 deleted file mode 100644 index 0301d27..0000000 --- a/config/pkg.m4 +++ /dev/null @@ -1,825 +0,0 @@ -dnl pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- -dnl serial 11 (pkg-config-0.29) -dnl -dnl Copyright © 2004 Scott James Remnant . -dnl Copyright © 2012-2015 Dan Nicholson -dnl -dnl This program is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU General Public License as published by -dnl the Free Software Foundation; either version 2 of the License, or -dnl (at your option) any later version. -dnl -dnl This program is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -dnl General Public License for more details. -dnl -dnl You should have received a copy of the GNU General Public License -dnl along with this program; if not, write to the Free Software -dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA -dnl 02111-1307, USA. -dnl -dnl As a special exception to the GNU General Public License, if you -dnl distribute this file as part of a program that contains a -dnl configuration script generated by Autoconf, you may include it under -dnl the same distribution terms that you use for the rest of that -dnl program. - -dnl PKG_PREREQ(MIN-VERSION) -dnl ----------------------- -dnl Since: 0.29 -dnl -dnl Verify that the version of the pkg-config macros are at least -dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's -dnl installed version of pkg-config, this checks the developer's version -dnl of pkg.m4 when generating configure. -dnl -dnl To ensure that this macro is defined, also add: -dnl m4_ifndef([PKG_PREREQ], -dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])]) -dnl -dnl See the "Since" comment for each macro you use to see what version -dnl of the macros you require. -m4_defun([PKG_PREREQ], -[m4_define([PKG_MACROS_VERSION], [0.29]) -m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1, - [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])]) -])dnl PKG_PREREQ - -dnl PKG_PROG_PKG_CONFIG([MIN-VERSION]) -dnl ---------------------------------- -dnl Since: 0.16 -dnl -dnl Search for the pkg-config tool and set the PKG_CONFIG variable to -dnl first found in the path. Checks that the version of pkg-config found -dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is -dnl used since that's the first version where most current features of -dnl pkg-config existed. -AC_DEFUN([PKG_PROG_PKG_CONFIG], -[m4_pattern_forbid([^_?PKG_[A-Z_]+$]) -m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$]) -m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$]) -AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility]) -AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path]) -AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path]) - -if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then - AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) -fi -if test -n "$PKG_CONFIG"; then - _pkg_min_version=m4_default([$1], [0.9.0]) - AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) - if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - PKG_CONFIG="" - fi -fi[]dnl -])dnl PKG_PROG_PKG_CONFIG - -dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -dnl ------------------------------------------------------------------- -dnl Since: 0.18 -dnl -dnl Check to see whether a particular set of modules exists. Similar to -dnl PKG_CHECK_MODULES(), but does not set variables or print errors. -dnl -dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) -dnl only at the first occurence in configure.ac, so if the first place -dnl it's called might be skipped (such as if it is within an "if", you -dnl have to call PKG_CHECK_EXISTS manually -AC_DEFUN([PKG_CHECK_EXISTS], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -if test -n "$PKG_CONFIG" && \ - AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then - m4_default([$2], [:]) -m4_ifvaln([$3], [else - $3])dnl -fi]) - -dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) -dnl --------------------------------------------- -dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting -dnl pkg_failed based on the result. -m4_define([_PKG_CONFIG], -[if test -n "$$1"; then - pkg_cv_[]$1="$$1" - elif test -n "$PKG_CONFIG"; then - PKG_CHECK_EXISTS([$3], - [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes ], - [pkg_failed=yes]) - else - pkg_failed=untried -fi[]dnl -])dnl _PKG_CONFIG - -dnl _PKG_SHORT_ERRORS_SUPPORTED -dnl --------------------------- -dnl Internal check to see if pkg-config supports short errors. -AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG]) -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi[]dnl -])dnl _PKG_SHORT_ERRORS_SUPPORTED - - -dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], -dnl [ACTION-IF-NOT-FOUND]) -dnl -------------------------------------------------------------- -dnl Since: 0.4.0 -dnl -dnl Note that if there is a possibility the first call to -dnl PKG_CHECK_MODULES might not happen, you should be sure to include an -dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac -AC_DEFUN([PKG_CHECK_MODULES], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl -AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl - -pkg_failed=no -AC_MSG_CHECKING([for $1]) - -_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) -_PKG_CONFIG([$1][_LIBS], [libs], [$2]) - -m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS -and $1[]_LIBS to avoid the need to call pkg-config. -See the pkg-config man page for more details.]) - -if test $pkg_failed = yes; then - AC_MSG_RESULT([no]) - _PKG_SHORT_ERRORS_SUPPORTED - if test $_pkg_short_errors_supported = yes; then - $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1` - else - $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD - - m4_default([$4], [AC_MSG_ERROR( -[Package requirements ($2) were not met: - -$$1_PKG_ERRORS - -Consider adjusting the PKG_CONFIG_PATH environment variable if you -installed software in a non-standard prefix. - -_PKG_TEXT])[]dnl - ]) -elif test $pkg_failed = untried; then - AC_MSG_RESULT([no]) - m4_default([$4], [AC_MSG_FAILURE( -[The pkg-config script could not be found or is too old. Make sure it -is in your PATH or set the PKG_CONFIG environment variable to the full -path to pkg-config. - -_PKG_TEXT - -To get pkg-config, see .])[]dnl - ]) -else - $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS - $1[]_LIBS=$pkg_cv_[]$1[]_LIBS - AC_MSG_RESULT([yes]) - $3 -fi[]dnl -])dnl PKG_CHECK_MODULES - - -dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], -dnl [ACTION-IF-NOT-FOUND]) -dnl --------------------------------------------------------------------- -dnl Since: 0.29 -dnl -dnl Checks for existence of MODULES and gathers its build flags with -dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags -dnl and VARIABLE-PREFIX_LIBS from --libs. -dnl -dnl Note that if there is a possibility the first call to -dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to -dnl include an explicit call to PKG_PROG_PKG_CONFIG in your -dnl configure.ac. -AC_DEFUN([PKG_CHECK_MODULES_STATIC], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -_save_PKG_CONFIG=$PKG_CONFIG -PKG_CONFIG="$PKG_CONFIG --static" -PKG_CHECK_MODULES($@) -PKG_CONFIG=$_save_PKG_CONFIG[]dnl -])dnl PKG_CHECK_MODULES_STATIC - - -dnl PKG_INSTALLDIR([DIRECTORY]) -dnl ------------------------- -dnl Since: 0.27 -dnl -dnl Substitutes the variable pkgconfigdir as the location where a module -dnl should install pkg-config .pc files. By default the directory is -dnl $libdir/pkgconfig, but the default can be changed by passing -dnl DIRECTORY. The user can override through the --with-pkgconfigdir -dnl parameter. -AC_DEFUN([PKG_INSTALLDIR], -[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])]) -m4_pushdef([pkg_description], - [pkg-config installation directory @<:@]pkg_default[@:>@]) -AC_ARG_WITH([pkgconfigdir], - [AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],, - [with_pkgconfigdir=]pkg_default) -AC_SUBST([pkgconfigdir], [$with_pkgconfigdir]) -m4_popdef([pkg_default]) -m4_popdef([pkg_description]) -])dnl PKG_INSTALLDIR - - -dnl PKG_NOARCH_INSTALLDIR([DIRECTORY]) -dnl -------------------------------- -dnl Since: 0.27 -dnl -dnl Substitutes the variable noarch_pkgconfigdir as the location where a -dnl module should install arch-independent pkg-config .pc files. By -dnl default the directory is $datadir/pkgconfig, but the default can be -dnl changed by passing DIRECTORY. The user can override through the -dnl --with-noarch-pkgconfigdir parameter. -AC_DEFUN([PKG_NOARCH_INSTALLDIR], -[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])]) -m4_pushdef([pkg_description], - [pkg-config arch-independent installation directory @<:@]pkg_default[@:>@]) -AC_ARG_WITH([noarch-pkgconfigdir], - [AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],, - [with_noarch_pkgconfigdir=]pkg_default) -AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir]) -m4_popdef([pkg_default]) -m4_popdef([pkg_description]) -])dnl PKG_NOARCH_INSTALLDIR - - -dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE, -dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -dnl ------------------------------------------- -dnl Since: 0.28 -dnl -dnl Retrieves the value of the pkg-config variable for the given module. -AC_DEFUN([PKG_CHECK_VAR], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl - -_PKG_CONFIG([$1], [variable="][$3]["], [$2]) -AS_VAR_COPY([$1], [pkg_cv_][$1]) - -AS_VAR_IF([$1], [""], [$5], [$4])dnl -])dnl PKG_CHECK_VAR -dnl pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- -dnl serial 11 (pkg-config-0.29) -dnl -dnl Copyright © 2004 Scott James Remnant . -dnl Copyright © 2012-2015 Dan Nicholson -dnl -dnl This program is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU General Public License as published by -dnl the Free Software Foundation; either version 2 of the License, or -dnl (at your option) any later version. -dnl -dnl This program is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -dnl General Public License for more details. -dnl -dnl You should have received a copy of the GNU General Public License -dnl along with this program; if not, write to the Free Software -dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA -dnl 02111-1307, USA. -dnl -dnl As a special exception to the GNU General Public License, if you -dnl distribute this file as part of a program that contains a -dnl configuration script generated by Autoconf, you may include it under -dnl the same distribution terms that you use for the rest of that -dnl program. - -dnl PKG_PREREQ(MIN-VERSION) -dnl ----------------------- -dnl Since: 0.29 -dnl -dnl Verify that the version of the pkg-config macros are at least -dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's -dnl installed version of pkg-config, this checks the developer's version -dnl of pkg.m4 when generating configure. -dnl -dnl To ensure that this macro is defined, also add: -dnl m4_ifndef([PKG_PREREQ], -dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])]) -dnl -dnl See the "Since" comment for each macro you use to see what version -dnl of the macros you require. -m4_defun([PKG_PREREQ], -[m4_define([PKG_MACROS_VERSION], [0.29]) -m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1, - [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])]) -])dnl PKG_PREREQ - -dnl PKG_PROG_PKG_CONFIG([MIN-VERSION]) -dnl ---------------------------------- -dnl Since: 0.16 -dnl -dnl Search for the pkg-config tool and set the PKG_CONFIG variable to -dnl first found in the path. Checks that the version of pkg-config found -dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is -dnl used since that's the first version where most current features of -dnl pkg-config existed. -AC_DEFUN([PKG_PROG_PKG_CONFIG], -[m4_pattern_forbid([^_?PKG_[A-Z_]+$]) -m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$]) -m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$]) -AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility]) -AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path]) -AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path]) - -if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then - AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) -fi -if test -n "$PKG_CONFIG"; then - _pkg_min_version=m4_default([$1], [0.9.0]) - AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) - if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - PKG_CONFIG="" - fi -fi[]dnl -])dnl PKG_PROG_PKG_CONFIG - -dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -dnl ------------------------------------------------------------------- -dnl Since: 0.18 -dnl -dnl Check to see whether a particular set of modules exists. Similar to -dnl PKG_CHECK_MODULES(), but does not set variables or print errors. -dnl -dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) -dnl only at the first occurence in configure.ac, so if the first place -dnl it's called might be skipped (such as if it is within an "if", you -dnl have to call PKG_CHECK_EXISTS manually -AC_DEFUN([PKG_CHECK_EXISTS], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -if test -n "$PKG_CONFIG" && \ - AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then - m4_default([$2], [:]) -m4_ifvaln([$3], [else - $3])dnl -fi]) - -dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) -dnl --------------------------------------------- -dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting -dnl pkg_failed based on the result. -m4_define([_PKG_CONFIG], -[if test -n "$$1"; then - pkg_cv_[]$1="$$1" - elif test -n "$PKG_CONFIG"; then - PKG_CHECK_EXISTS([$3], - [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes ], - [pkg_failed=yes]) - else - pkg_failed=untried -fi[]dnl -])dnl _PKG_CONFIG - -dnl _PKG_SHORT_ERRORS_SUPPORTED -dnl --------------------------- -dnl Internal check to see if pkg-config supports short errors. -AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG]) -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi[]dnl -])dnl _PKG_SHORT_ERRORS_SUPPORTED - - -dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], -dnl [ACTION-IF-NOT-FOUND]) -dnl -------------------------------------------------------------- -dnl Since: 0.4.0 -dnl -dnl Note that if there is a possibility the first call to -dnl PKG_CHECK_MODULES might not happen, you should be sure to include an -dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac -AC_DEFUN([PKG_CHECK_MODULES], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl -AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl - -pkg_failed=no -AC_MSG_CHECKING([for $1]) - -_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) -_PKG_CONFIG([$1][_LIBS], [libs], [$2]) - -m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS -and $1[]_LIBS to avoid the need to call pkg-config. -See the pkg-config man page for more details.]) - -if test $pkg_failed = yes; then - AC_MSG_RESULT([no]) - _PKG_SHORT_ERRORS_SUPPORTED - if test $_pkg_short_errors_supported = yes; then - $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1` - else - $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD - - m4_default([$4], [AC_MSG_ERROR( -[Package requirements ($2) were not met: - -$$1_PKG_ERRORS - -Consider adjusting the PKG_CONFIG_PATH environment variable if you -installed software in a non-standard prefix. - -_PKG_TEXT])[]dnl - ]) -elif test $pkg_failed = untried; then - AC_MSG_RESULT([no]) - m4_default([$4], [AC_MSG_FAILURE( -[The pkg-config script could not be found or is too old. Make sure it -is in your PATH or set the PKG_CONFIG environment variable to the full -path to pkg-config. - -_PKG_TEXT - -To get pkg-config, see .])[]dnl - ]) -else - $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS - $1[]_LIBS=$pkg_cv_[]$1[]_LIBS - AC_MSG_RESULT([yes]) - $3 -fi[]dnl -])dnl PKG_CHECK_MODULES - - -dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], -dnl [ACTION-IF-NOT-FOUND]) -dnl --------------------------------------------------------------------- -dnl Since: 0.29 -dnl -dnl Checks for existence of MODULES and gathers its build flags with -dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags -dnl and VARIABLE-PREFIX_LIBS from --libs. -dnl -dnl Note that if there is a possibility the first call to -dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to -dnl include an explicit call to PKG_PROG_PKG_CONFIG in your -dnl configure.ac. -AC_DEFUN([PKG_CHECK_MODULES_STATIC], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -_save_PKG_CONFIG=$PKG_CONFIG -PKG_CONFIG="$PKG_CONFIG --static" -PKG_CHECK_MODULES($@) -PKG_CONFIG=$_save_PKG_CONFIG[]dnl -])dnl PKG_CHECK_MODULES_STATIC - - -dnl PKG_INSTALLDIR([DIRECTORY]) -dnl ------------------------- -dnl Since: 0.27 -dnl -dnl Substitutes the variable pkgconfigdir as the location where a module -dnl should install pkg-config .pc files. By default the directory is -dnl $libdir/pkgconfig, but the default can be changed by passing -dnl DIRECTORY. The user can override through the --with-pkgconfigdir -dnl parameter. -AC_DEFUN([PKG_INSTALLDIR], -[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])]) -m4_pushdef([pkg_description], - [pkg-config installation directory @<:@]pkg_default[@:>@]) -AC_ARG_WITH([pkgconfigdir], - [AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],, - [with_pkgconfigdir=]pkg_default) -AC_SUBST([pkgconfigdir], [$with_pkgconfigdir]) -m4_popdef([pkg_default]) -m4_popdef([pkg_description]) -])dnl PKG_INSTALLDIR - - -dnl PKG_NOARCH_INSTALLDIR([DIRECTORY]) -dnl -------------------------------- -dnl Since: 0.27 -dnl -dnl Substitutes the variable noarch_pkgconfigdir as the location where a -dnl module should install arch-independent pkg-config .pc files. By -dnl default the directory is $datadir/pkgconfig, but the default can be -dnl changed by passing DIRECTORY. The user can override through the -dnl --with-noarch-pkgconfigdir parameter. -AC_DEFUN([PKG_NOARCH_INSTALLDIR], -[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])]) -m4_pushdef([pkg_description], - [pkg-config arch-independent installation directory @<:@]pkg_default[@:>@]) -AC_ARG_WITH([noarch-pkgconfigdir], - [AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],, - [with_noarch_pkgconfigdir=]pkg_default) -AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir]) -m4_popdef([pkg_default]) -m4_popdef([pkg_description]) -])dnl PKG_NOARCH_INSTALLDIR - - -dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE, -dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -dnl ------------------------------------------- -dnl Since: 0.28 -dnl -dnl Retrieves the value of the pkg-config variable for the given module. -AC_DEFUN([PKG_CHECK_VAR], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl - -_PKG_CONFIG([$1], [variable="][$3]["], [$2]) -AS_VAR_COPY([$1], [pkg_cv_][$1]) - -AS_VAR_IF([$1], [""], [$5], [$4])dnl -])dnl PKG_CHECK_VAR -dnl pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- -dnl serial 11 (pkg-config-0.29) -dnl -dnl Copyright © 2004 Scott James Remnant . -dnl Copyright © 2012-2015 Dan Nicholson -dnl -dnl This program is free software; you can redistribute it and/or modify -dnl it under the terms of the GNU General Public License as published by -dnl the Free Software Foundation; either version 2 of the License, or -dnl (at your option) any later version. -dnl -dnl This program is distributed in the hope that it will be useful, but -dnl WITHOUT ANY WARRANTY; without even the implied warranty of -dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -dnl General Public License for more details. -dnl -dnl You should have received a copy of the GNU General Public License -dnl along with this program; if not, write to the Free Software -dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA -dnl 02111-1307, USA. -dnl -dnl As a special exception to the GNU General Public License, if you -dnl distribute this file as part of a program that contains a -dnl configuration script generated by Autoconf, you may include it under -dnl the same distribution terms that you use for the rest of that -dnl program. - -dnl PKG_PREREQ(MIN-VERSION) -dnl ----------------------- -dnl Since: 0.29 -dnl -dnl Verify that the version of the pkg-config macros are at least -dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's -dnl installed version of pkg-config, this checks the developer's version -dnl of pkg.m4 when generating configure. -dnl -dnl To ensure that this macro is defined, also add: -dnl m4_ifndef([PKG_PREREQ], -dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])]) -dnl -dnl See the "Since" comment for each macro you use to see what version -dnl of the macros you require. -m4_defun([PKG_PREREQ], -[m4_define([PKG_MACROS_VERSION], [0.29]) -m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1, - [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])]) -])dnl PKG_PREREQ - -dnl PKG_PROG_PKG_CONFIG([MIN-VERSION]) -dnl ---------------------------------- -dnl Since: 0.16 -dnl -dnl Search for the pkg-config tool and set the PKG_CONFIG variable to -dnl first found in the path. Checks that the version of pkg-config found -dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is -dnl used since that's the first version where most current features of -dnl pkg-config existed. -AC_DEFUN([PKG_PROG_PKG_CONFIG], -[m4_pattern_forbid([^_?PKG_[A-Z_]+$]) -m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$]) -m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$]) -AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility]) -AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path]) -AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path]) - -if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then - AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) -fi -if test -n "$PKG_CONFIG"; then - _pkg_min_version=m4_default([$1], [0.9.0]) - AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) - if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then - AC_MSG_RESULT([yes]) - else - AC_MSG_RESULT([no]) - PKG_CONFIG="" - fi -fi[]dnl -])dnl PKG_PROG_PKG_CONFIG - -dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -dnl ------------------------------------------------------------------- -dnl Since: 0.18 -dnl -dnl Check to see whether a particular set of modules exists. Similar to -dnl PKG_CHECK_MODULES(), but does not set variables or print errors. -dnl -dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) -dnl only at the first occurence in configure.ac, so if the first place -dnl it's called might be skipped (such as if it is within an "if", you -dnl have to call PKG_CHECK_EXISTS manually -AC_DEFUN([PKG_CHECK_EXISTS], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -if test -n "$PKG_CONFIG" && \ - AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then - m4_default([$2], [:]) -m4_ifvaln([$3], [else - $3])dnl -fi]) - -dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) -dnl --------------------------------------------- -dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting -dnl pkg_failed based on the result. -m4_define([_PKG_CONFIG], -[if test -n "$$1"; then - pkg_cv_[]$1="$$1" - elif test -n "$PKG_CONFIG"; then - PKG_CHECK_EXISTS([$3], - [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null` - test "x$?" != "x0" && pkg_failed=yes ], - [pkg_failed=yes]) - else - pkg_failed=untried -fi[]dnl -])dnl _PKG_CONFIG - -dnl _PKG_SHORT_ERRORS_SUPPORTED -dnl --------------------------- -dnl Internal check to see if pkg-config supports short errors. -AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG]) -if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then - _pkg_short_errors_supported=yes -else - _pkg_short_errors_supported=no -fi[]dnl -])dnl _PKG_SHORT_ERRORS_SUPPORTED - - -dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], -dnl [ACTION-IF-NOT-FOUND]) -dnl -------------------------------------------------------------- -dnl Since: 0.4.0 -dnl -dnl Note that if there is a possibility the first call to -dnl PKG_CHECK_MODULES might not happen, you should be sure to include an -dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac -AC_DEFUN([PKG_CHECK_MODULES], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl -AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl - -pkg_failed=no -AC_MSG_CHECKING([for $1]) - -_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) -_PKG_CONFIG([$1][_LIBS], [libs], [$2]) - -m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS -and $1[]_LIBS to avoid the need to call pkg-config. -See the pkg-config man page for more details.]) - -if test $pkg_failed = yes; then - AC_MSG_RESULT([no]) - _PKG_SHORT_ERRORS_SUPPORTED - if test $_pkg_short_errors_supported = yes; then - $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1` - else - $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1` - fi - # Put the nasty error message in config.log where it belongs - echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD - - m4_default([$4], [AC_MSG_ERROR( -[Package requirements ($2) were not met: - -$$1_PKG_ERRORS - -Consider adjusting the PKG_CONFIG_PATH environment variable if you -installed software in a non-standard prefix. - -_PKG_TEXT])[]dnl - ]) -elif test $pkg_failed = untried; then - AC_MSG_RESULT([no]) - m4_default([$4], [AC_MSG_FAILURE( -[The pkg-config script could not be found or is too old. Make sure it -is in your PATH or set the PKG_CONFIG environment variable to the full -path to pkg-config. - -_PKG_TEXT - -To get pkg-config, see .])[]dnl - ]) -else - $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS - $1[]_LIBS=$pkg_cv_[]$1[]_LIBS - AC_MSG_RESULT([yes]) - $3 -fi[]dnl -])dnl PKG_CHECK_MODULES - - -dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], -dnl [ACTION-IF-NOT-FOUND]) -dnl --------------------------------------------------------------------- -dnl Since: 0.29 -dnl -dnl Checks for existence of MODULES and gathers its build flags with -dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags -dnl and VARIABLE-PREFIX_LIBS from --libs. -dnl -dnl Note that if there is a possibility the first call to -dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to -dnl include an explicit call to PKG_PROG_PKG_CONFIG in your -dnl configure.ac. -AC_DEFUN([PKG_CHECK_MODULES_STATIC], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -_save_PKG_CONFIG=$PKG_CONFIG -PKG_CONFIG="$PKG_CONFIG --static" -PKG_CHECK_MODULES($@) -PKG_CONFIG=$_save_PKG_CONFIG[]dnl -])dnl PKG_CHECK_MODULES_STATIC - - -dnl PKG_INSTALLDIR([DIRECTORY]) -dnl ------------------------- -dnl Since: 0.27 -dnl -dnl Substitutes the variable pkgconfigdir as the location where a module -dnl should install pkg-config .pc files. By default the directory is -dnl $libdir/pkgconfig, but the default can be changed by passing -dnl DIRECTORY. The user can override through the --with-pkgconfigdir -dnl parameter. -AC_DEFUN([PKG_INSTALLDIR], -[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])]) -m4_pushdef([pkg_description], - [pkg-config installation directory @<:@]pkg_default[@:>@]) -AC_ARG_WITH([pkgconfigdir], - [AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],, - [with_pkgconfigdir=]pkg_default) -AC_SUBST([pkgconfigdir], [$with_pkgconfigdir]) -m4_popdef([pkg_default]) -m4_popdef([pkg_description]) -])dnl PKG_INSTALLDIR - - -dnl PKG_NOARCH_INSTALLDIR([DIRECTORY]) -dnl -------------------------------- -dnl Since: 0.27 -dnl -dnl Substitutes the variable noarch_pkgconfigdir as the location where a -dnl module should install arch-independent pkg-config .pc files. By -dnl default the directory is $datadir/pkgconfig, but the default can be -dnl changed by passing DIRECTORY. The user can override through the -dnl --with-noarch-pkgconfigdir parameter. -AC_DEFUN([PKG_NOARCH_INSTALLDIR], -[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])]) -m4_pushdef([pkg_description], - [pkg-config arch-independent installation directory @<:@]pkg_default[@:>@]) -AC_ARG_WITH([noarch-pkgconfigdir], - [AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],, - [with_noarch_pkgconfigdir=]pkg_default) -AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir]) -m4_popdef([pkg_default]) -m4_popdef([pkg_description]) -])dnl PKG_NOARCH_INSTALLDIR - - -dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE, -dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) -dnl ------------------------------------------- -dnl Since: 0.28 -dnl -dnl Retrieves the value of the pkg-config variable for the given module. -AC_DEFUN([PKG_CHECK_VAR], -[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl -AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl - -_PKG_CONFIG([$1], [variable="][$3]["], [$2]) -AS_VAR_COPY([$1], [pkg_cv_][$1]) - -AS_VAR_IF([$1], [""], [$5], [$4])dnl -])dnl PKG_CHECK_VAR diff --git a/configure b/configure index cec133e..fb79e73 100755 --- a/configure +++ b/configure @@ -643,9 +643,6 @@ DEBUG_PREFIX_CFLAGS_FOR_TARGET SYSROOT_CFLAGS_FOR_TARGET extra_host_zlib_configure_flags extra_host_libiberty_configure_flags -PKG_CONFIG_LIBDIR -PKG_CONFIG_PATH -PKG_CONFIG stage1_languages extra_linker_plugin_flags extra_linker_plugin_configure_flags @@ -811,9 +808,6 @@ CPPFLAGS CXX CXXFLAGS CCC -PKG_CONFIG -PKG_CONFIG_PATH -PKG_CONFIG_LIBDIR build_configargs host_configargs target_configargs @@ -1587,11 +1581,6 @@ Some influential environment variables: you have headers in a nonstandard directory CXX C++ compiler command CXXFLAGS C++ compiler flags - PKG_CONFIG path to pkg-config utility - PKG_CONFIG_PATH - directories to add to pkg-config's search path - PKG_CONFIG_LIBDIR - path overriding pkg-config's built-in search path build_configargs additional configure arguments for build directories host_configargs @@ -6441,137 +6430,8 @@ case ,${enable_languages},:${enable_objc_gc} in *,objc,*:yes|*,objc,*:auto) { $as_echo "$as_me:${as_lineno-$LINENO}: checking for bdw garbage collector" >&5 $as_echo_n "checking for bdw garbage collector... " >&6; } if test "x$with_target_bdw_gc$with_target_bdw_gc_include$with_target_bdw_gc_lib" = x; then - - - - - - - -if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then - if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}pkg-config", so it can be a program name with args. -set dummy ${ac_tool_prefix}pkg-config; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if test "${ac_cv_path_PKG_CONFIG+set}" = set; then : - $as_echo_n "(cached) " >&6 -else - case $PKG_CONFIG in - [\\/]* | ?:[\\/]*) - ac_cv_path_PKG_CONFIG="$PKG_CONFIG" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_path_PKG_CONFIG="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - - ;; -esac -fi -PKG_CONFIG=$ac_cv_path_PKG_CONFIG -if test -n "$PKG_CONFIG"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $PKG_CONFIG" >&5 -$as_echo "$PKG_CONFIG" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - -fi -if test -z "$ac_cv_path_PKG_CONFIG"; then - ac_pt_PKG_CONFIG=$PKG_CONFIG - # Extract the first word of "pkg-config", so it can be a program name with args. -set dummy pkg-config; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if test "${ac_cv_path_ac_pt_PKG_CONFIG+set}" = set; then : - $as_echo_n "(cached) " >&6 -else - case $ac_pt_PKG_CONFIG in - [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_PKG_CONFIG="$ac_pt_PKG_CONFIG" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_path_ac_pt_PKG_CONFIG="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - - ;; -esac -fi -ac_pt_PKG_CONFIG=$ac_cv_path_ac_pt_PKG_CONFIG -if test -n "$ac_pt_PKG_CONFIG"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_PKG_CONFIG" >&5 -$as_echo "$ac_pt_PKG_CONFIG" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - if test "x$ac_pt_PKG_CONFIG" = x; then - PKG_CONFIG="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - PKG_CONFIG=$ac_pt_PKG_CONFIG - fi -else - PKG_CONFIG="$ac_cv_path_PKG_CONFIG" -fi - -fi -if test -n "$PKG_CONFIG"; then - _pkg_min_version=0.9.0 - { $as_echo "$as_me:${as_lineno-$LINENO}: checking pkg-config is at least version $_pkg_min_version" >&5 -$as_echo_n "checking pkg-config is at least version $_pkg_min_version... " >&6; } - if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - PKG_CONFIG="" - fi -fi -if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"bdw-gc\""; } >&5 - ($PKG_CONFIG --exists --print-errors "bdw-gc") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: using bdw-gc pkg-config module" >&5 -$as_echo "using bdw-gc pkg-config module" >&6; } -else - as_fn_error "no --with-target-bdw-gc options and no bdw-gc pkg-config module found" "$LINENO" 5 -fi + { $as_echo "$as_me:${as_lineno-$LINENO}: result: using bdw-gc in default locations" >&5 +$as_echo "using bdw-gc in default locations" >&6; } else if test "x$with_target_bdw_gc_include" = x && test "x$with_target_bdw_gc_lib" != x; then as_fn_error "found --with-target-bdw-gc-lib but --with-target-bdw-gc-include missing" "$LINENO" 5 diff --git a/configure.ac b/configure.ac index ed89ea2..51ee705 100644 --- a/configure.ac +++ b/configure.ac @@ -29,7 +29,6 @@ m4_include([ltsugar.m4]) m4_include([ltversion.m4]) m4_include([lt~obsolete.m4]) m4_include([config/isl.m4]) -m4_include([config/pkg.m4]) AC_INIT(move-if-change) AC_PREREQ(2.64) @@ -2076,10 +2075,8 @@ AC_ARG_WITH([target-bdw-gc-lib], case ,${enable_languages},:${enable_objc_gc} in *,objc,*:yes|*,objc,*:auto) AC_MSG_CHECKING([for bdw garbage collector]) if test "x$with_target_bdw_gc$with_target_bdw_gc_include$with_target_bdw_gc_lib" = x; then - dnl no bdw-gw options, fall back to the bdw-gc pkg-config module - PKG_CHECK_EXISTS(bdw-gc, - AC_MSG_RESULT([using bdw-gc pkg-config module]), - AC_MSG_ERROR([no --with-target-bdw-gc options and no bdw-gc pkg-config module found])) + dnl no bdw-gw options, assume default locations + AC_MSG_RESULT([using bdw-gc in default locations]) else dnl bdw-gw options, first error checking, complete checking in libobjc if test "x$with_target_bdw_gc_include" = x && test "x$with_target_bdw_gc_lib" != x; then diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 19cb0ce..a164a01 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,7 @@ +2016-12-01 Matthias Klose + + * doc/install.texi: Don't use pkg-config to check for bdw-gc. + 2016-12-01 Richard Biener * tree-ssa-alias.c (indirect_refs_may_alias_p): Do not @@ -34,6 +38,7 @@ PR rtl-optimization/78607 * combine.c (try_combine): Emit a barrier after a unconditional trap. +>>>>>>> .r243107 2016-11-30 Michael Meissner PR target/78602 diff --git a/gcc/doc/install.texi b/gcc/doc/install.texi index 140ff80..b911d76 100644 --- a/gcc/doc/install.texi +++ b/gcc/doc/install.texi @@ -2204,8 +2204,7 @@ The options @option{--with-target-bdw-gc-include} and @option{--with-target-bdw-gc-lib} must always be specified together for each multilib variant and they take precedence over @option{--with-target-bdw-gc}. If none of these options are -specified, the values are taken from the @command{pkg-config} -@samp{bdw-gc} module. +specified, the library is assumed in default locations. @end table @html diff --git a/libobjc/ChangeLog b/libobjc/ChangeLog index f6eadaf..60dfc16 100644 --- a/libobjc/ChangeLog +++ b/libobjc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 Matthias Klose + + * configure.ac: Don't use pkg-config to check for bdw-gc. + * configure: Regenerate. + 2016-11-30 Matthias Klose * configure.ac: Set BDW_GC_CFLAGS and BDW_GC_LIBS after checking diff --git a/libobjc/configure b/libobjc/configure index c617f27..62bdc2b 100755 --- a/libobjc/configure +++ b/libobjc/configure @@ -604,9 +604,6 @@ OBJC_BOEHM_GC_LIBS OBJC_BOEHM_GC_INCLUDES OBJC_BOEHM_GC OBJC_GCFLAGS -PKG_CONFIG_LIBDIR -PKG_CONFIG_PATH -PKG_CONFIG SET_MAKE CPP OTOOL64 @@ -733,10 +730,7 @@ with_target_bdw_gc_lib host_alias target_alias CPP -CPPFLAGS -PKG_CONFIG -PKG_CONFIG_PATH -PKG_CONFIG_LIBDIR' +CPPFLAGS' # Initialize some variables set by options. @@ -1395,11 +1389,6 @@ Some influential environment variables: CPPFLAGS C/C++/Objective C preprocessor flags, e.g. -I if you have headers in a nonstandard directory CPP C preprocessor - PKG_CONFIG path to pkg-config utility - PKG_CONFIG_PATH - directories to add to pkg-config's search path - PKG_CONFIG_LIBDIR - path overriding pkg-config's built-in search path Use these variables to override the choices made by `configure' or to help it to find libraries and programs with nonstandard names/locations. @@ -10601,7 +10590,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 10604 "configure" +#line 10593 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -10707,7 +10696,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 10710 "configure" +#line 10699 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -11540,139 +11529,8 @@ no) { $as_echo "$as_me:${as_lineno-$LINENO}: checking for bdw garbage collector" >&5 $as_echo_n "checking for bdw garbage collector... " >&6; } if test "x$with_target_bdw_gc$with_target_bdw_gc_include$with_target_bdw_gc_lib" = x; then - - - - - - - -if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then - if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}pkg-config", so it can be a program name with args. -set dummy ${ac_tool_prefix}pkg-config; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if test "${ac_cv_path_PKG_CONFIG+set}" = set; then : - $as_echo_n "(cached) " >&6 -else - case $PKG_CONFIG in - [\\/]* | ?:[\\/]*) - ac_cv_path_PKG_CONFIG="$PKG_CONFIG" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_path_PKG_CONFIG="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - - ;; -esac -fi -PKG_CONFIG=$ac_cv_path_PKG_CONFIG -if test -n "$PKG_CONFIG"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $PKG_CONFIG" >&5 -$as_echo "$PKG_CONFIG" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - -fi -if test -z "$ac_cv_path_PKG_CONFIG"; then - ac_pt_PKG_CONFIG=$PKG_CONFIG - # Extract the first word of "pkg-config", so it can be a program name with args. -set dummy pkg-config; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if test "${ac_cv_path_ac_pt_PKG_CONFIG+set}" = set; then : - $as_echo_n "(cached) " >&6 -else - case $ac_pt_PKG_CONFIG in - [\\/]* | ?:[\\/]*) - ac_cv_path_ac_pt_PKG_CONFIG="$ac_pt_PKG_CONFIG" # Let the user override the test with a path. - ;; - *) - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_path_ac_pt_PKG_CONFIG="$as_dir/$ac_word$ac_exec_ext" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - - ;; -esac -fi -ac_pt_PKG_CONFIG=$ac_cv_path_ac_pt_PKG_CONFIG -if test -n "$ac_pt_PKG_CONFIG"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_PKG_CONFIG" >&5 -$as_echo "$ac_pt_PKG_CONFIG" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi - - if test "x$ac_pt_PKG_CONFIG" = x; then - PKG_CONFIG="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - PKG_CONFIG=$ac_pt_PKG_CONFIG - fi -else - PKG_CONFIG="$ac_cv_path_PKG_CONFIG" -fi - -fi -if test -n "$PKG_CONFIG"; then - _pkg_min_version=0.9.0 - { $as_echo "$as_me:${as_lineno-$LINENO}: checking pkg-config is at least version $_pkg_min_version" >&5 -$as_echo_n "checking pkg-config is at least version $_pkg_min_version... " >&6; } - if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 -$as_echo "yes" >&6; } - else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } - PKG_CONFIG="" - fi -fi -if test -n "$PKG_CONFIG" && \ - { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"bdw-gc\""; } >&5 - ($PKG_CONFIG --exists --print-errors "bdw-gc") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: using bdw-gc pkg-config module" >&5 -$as_echo "using bdw-gc pkg-config module" >&6; } - BDW_GC_CFLAGS=`$PKG_CONFIG --cflags bdw-gc` - BDW_GC_LIBS=`$PKG_CONFIG --libs bdw-gc` -else - as_fn_error "no --with-target-bdw-gc options and no bdw-gc pkg-config module found" "$LINENO" 5 -fi + BDW_GC_CFLAGS= + BDW_GC_LIBS="-lgc" else if test "x$with_target_bdw_gc_include" = x && test "x$with_target_bdw_gc_lib" != x; then as_fn_error "found --with-target-bdw-gc-lib but --with-target-bdw-gc-include missing" "$LINENO" 5 diff --git a/libobjc/configure.ac b/libobjc/configure.ac index 2303143..3c13b33 100644 --- a/libobjc/configure.ac +++ b/libobjc/configure.ac @@ -18,8 +18,6 @@ #along with GCC; see the file COPYING3. If not see #. -m4_include([../config/pkg.m4]) - AC_PREREQ(2.64) AC_INIT(package-unused, version-unused,, libobjc) AC_CONFIG_SRCDIR([objc/objc.h]) @@ -223,12 +221,9 @@ no) *) AC_MSG_CHECKING([for bdw garbage collector]) if test "x$with_target_bdw_gc$with_target_bdw_gc_include$with_target_bdw_gc_lib" = x; then - dnl no bdw-gw options, fall back to the bdw-gc pkg-config module - PKG_CHECK_EXISTS(bdw-gc, - [AC_MSG_RESULT([using bdw-gc pkg-config module]) - BDW_GC_CFLAGS=`$PKG_CONFIG --cflags bdw-gc` - BDW_GC_LIBS=`$PKG_CONFIG --libs bdw-gc`], - AC_MSG_ERROR([no --with-target-bdw-gc options and no bdw-gc pkg-config module found])) + dnl no bdw-gw options, assuming bdw-gc in default locations + BDW_GC_CFLAGS= + BDW_GC_LIBS="-lgc" else dnl bdw-gw options passed by configure flags if test "x$with_target_bdw_gc_include" = x && test "x$with_target_bdw_gc_lib" != x; then -- cgit v1.1 From 8e9c33cd3bd55c2364a2efc7fc2de0957d7b991d Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Thu, 1 Dec 2016 08:48:22 -0500 Subject: Remove svn conflict marker. From-SVN: r243110 --- gcc/ChangeLog | 1 - 1 file changed, 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a164a01..4d15a73 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -38,7 +38,6 @@ PR rtl-optimization/78607 * combine.c (try_combine): Emit a barrier after a unconditional trap. ->>>>>>> .r243107 2016-11-30 Michael Meissner PR target/78602 -- cgit v1.1 From 67586d38f5b1858fba96973e9341f7d65f64ea9c Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Thu, 1 Dec 2016 14:04:13 +0000 Subject: Fix rtl-optimization/78596 - combine.c:12561:14: runtime error: left shift of negative value PR rtl-optimization/78596 * combine.c (simplify_comparison): Cast to unsigned to avoid left shifting of negative value. From-SVN: r243111 --- gcc/ChangeLog | 6 ++++++ gcc/combine.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 4d15a73..b90cbc6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2016-12-01 Markus Trippelsdorf + + PR rtl-optimization/78596 + * combine.c (simplify_comparison): Cast to unsigned to avoid + left shifting of negative value. + 2016-12-01 Matthias Klose * doc/install.texi: Don't use pkg-config to check for bdw-gc. diff --git a/gcc/combine.c b/gcc/combine.c index faafcb7..b429453 100644 --- a/gcc/combine.c +++ b/gcc/combine.c @@ -12561,7 +12561,8 @@ simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1) if (GET_CODE (op0) == LSHIFTRT) code = unsigned_condition (code); - const_op <<= INTVAL (XEXP (op0, 1)); + const_op = (unsigned HOST_WIDE_INT) const_op + << INTVAL (XEXP (op0, 1)); if (low_bits != 0 && (code == GT || code == GTU || code == LE || code == LEU)) -- cgit v1.1 From be5ddbb86fbc4d7651f4c748528ecab6d31cd035 Mon Sep 17 00:00:00 2001 From: Rainer Orth Date: Thu, 1 Dec 2016 14:48:49 +0000 Subject: Import libcilkrts Build 4467 (PR target/68945) PR target/68945 Merge from upstream, version 2.0.4467.0. Fix typo in git URL. * aclocal.m4, configure, Makefile.in: Regenerate. From-SVN: r243112 --- libcilkrts/ChangeLog | 7 ++ libcilkrts/Makefile.am | 2 +- libcilkrts/Makefile.in | 4 +- libcilkrts/README | 104 +++++++++++++++++--- libcilkrts/configure | 58 ++++++++++-- libcilkrts/configure.ac | 14 ++- libcilkrts/configure.tgt | 6 +- libcilkrts/include/cilk/cilk_undocumented.h | 3 - libcilkrts/include/internal/cilk_version.h | 10 +- libcilkrts/include/internal/rev.mk | 2 +- libcilkrts/runtime/cilk-abi.c | 1 - libcilkrts/runtime/cilk_fiber-unix.cpp | 2 +- libcilkrts/runtime/cilk_fiber.h | 5 +- libcilkrts/runtime/config/arm/os-fence.h | 19 +++- libcilkrts/runtime/config/sparc/cilk-abi-vla.c | 115 +++++++++++++++++++++++ libcilkrts/runtime/config/sparc/os-fence.h | 64 +++++++++++++ libcilkrts/runtime/config/sparc/os-unix-sysdep.c | 115 +++++++++++++++++++++++ libcilkrts/runtime/except-gcc.cpp | 2 + libcilkrts/runtime/global_state.cpp | 1 - libcilkrts/runtime/global_state.h | 1 - libcilkrts/runtime/jmpbuf.h | 14 ++- libcilkrts/runtime/linux-symbols.ver | 1 - libcilkrts/runtime/local_state.h | 7 ++ libcilkrts/runtime/mac-symbols.txt | 1 - libcilkrts/runtime/os-unix.c | 5 +- libcilkrts/runtime/record-replay.cpp | 2 +- libcilkrts/runtime/scheduler.c | 18 +++- libcilkrts/runtime/scheduler.h | 3 + libcilkrts/runtime/sysdep-unix.c | 12 +-- 29 files changed, 530 insertions(+), 68 deletions(-) create mode 100644 libcilkrts/runtime/config/sparc/cilk-abi-vla.c create mode 100644 libcilkrts/runtime/config/sparc/os-fence.h create mode 100644 libcilkrts/runtime/config/sparc/os-unix-sysdep.c diff --git a/libcilkrts/ChangeLog b/libcilkrts/ChangeLog index 68ca82a..443f0a2 100644 --- a/libcilkrts/ChangeLog +++ b/libcilkrts/ChangeLog @@ -1,3 +1,10 @@ +2016-12-01 Rainer Orth + + PR target/68945 + Merge from upstream, version 2.0.4467.0. + Fix typo in git URL. + * aclocal.m4, configure, Makefile.in: Regenerate. + 2016-11-15 Matthias Klose * configure: Regenerate. diff --git a/libcilkrts/Makefile.am b/libcilkrts/Makefile.am index 3736a63..1dec6aa 100644 --- a/libcilkrts/Makefile.am +++ b/libcilkrts/Makefile.am @@ -54,7 +54,7 @@ GENERAL_FLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/runtime -I$(top_srcdir)/ # Enable Intel Cilk Plus extension GENERAL_FLAGS += -fcilkplus -# Always generate unwind tables +#Always generate unwind tables GENERAL_FLAGS += -funwind-tables AM_CFLAGS = $(XCFLAGS) $(GENERAL_FLAGS) -std=c99 diff --git a/libcilkrts/Makefile.in b/libcilkrts/Makefile.in index ff88e9d..6520b16 100644 --- a/libcilkrts/Makefile.in +++ b/libcilkrts/Makefile.in @@ -396,7 +396,7 @@ ACLOCAL_AMFLAGS = -I .. -I ../config # Enable Intel Cilk Plus extension -# Always generate unwind tables +#Always generate unwind tables GENERAL_FLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/runtime \ -I$(top_srcdir)/runtime/config/$(config_dir) \ -I$(top_srcdir)/runtime/sslib -DIN_CILK_RUNTIME=1 -fcilkplus \ @@ -455,7 +455,7 @@ libcilkrts_la_SOURCES = \ runtime/sysdep-unix.c \ runtime/worker_mutex.c -CILK_REVISION = 4420 +CILK_REVISION = 4467 # Load the $(REVISION) value. diff --git a/libcilkrts/README b/libcilkrts/README index d3503f4..54f8b04 100644 --- a/libcilkrts/README +++ b/libcilkrts/README @@ -1,14 +1,16 @@ -Intel(R) Cilk(TM) Plus runtime library +Intel(R) Cilk(TM) Plus Runtime Library Index: -1. BUILDING -2. USING -3. DOXYGEN DOCUMENTATION -4. QUESTIONS OR BUGS -5. CONTRIBUTIONS +1. BUILDING WITH AUTOMAKE +2. BUILDING WITH CMAKE +3. INSTALLING TO VXWORKS +4. USING +5. DOXYGEN DOCUMENTATION +6. QUESTIONS OR BUGS +7. CONTRIBUTIONS # -# 1. BUILDING: +# 1. BUILDING WITH AUTOMAKE: # To distribute applications that use the Intel Cilk Plus language @@ -40,22 +42,87 @@ configure script: % ./configure --prefix=/your/path/to/lib -It is also possible to use CMake if the above method does not apply -well in your environment. Instruction is available in CMakeLists.txt. +# +# 2. BUILDING WITH CMAKE: +# + +To distribute applications that use the Intel Cilk Plus language +extensions to non-development systems, you need to build the Intel +Cilk Plus runtime library and distribute it with your application. +This instruction describes the build process using CMake*, which +supports Linux*, Windows*, and OS X*. It is fine to use this process +to build a Linux library, but it is highly recommended to use the +more mature build process described above when building on Linux. + +You need the CMake tool and a C/C++ compiler that supports the Intel +Cilk Plus language extensions, and the requirements for each operating +systems are: + +Common: + CMake 3.0.0 or later + Make tools such as make (Linux, OS X) or nmake (Windows) +Linux: + GCC* 4.9.2 or later, or Intel(R) C++ Compiler v12.1 or later +Windows: + Intel C++ Compiler v12.1 or later + Visual Studio* 2010 or later +OS X: + Cilk-enabled branch of Clang*/LLVM* (http://cilkplus.github.io), + or Intel C++ Compiler v12.1 or later + +The common steps to build the libraries are 1) invoke cmake with +appropriate options, 2) invoke a make tool available on the system. +The following examples show build processes on OS X and Windows. + +OS X: + % mkdir ./build && cd ./build + % cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_INSTALL_PREFIX=./install .. + % make && make install + +Windows: + % mkdir .\build && cd .\build + % cmake -G "NMake Makefiles" -DCMAKE_C_COMPILER=icl \ + -DCMAKE_CXX_COMPILER=icl -DCMAKE_INSTALL_PREFIX=.\install .. + % nmake && nmake install + +# +# 3. INSTALLING TO VXWORKS OS +# + +For Windows host, run VxWorks_Install.bat. +For Linux host, run VxWorks_Install.sh. + +You may need to give environment variable WIND_BASE to indicate VxWorks +installation path. + +Create a VSB project, and you will see a layer named "CILKPLUS_KERNEL". +Enable it and build the project. + +Create a VIP project with ICC and add component INCLUDE_CILKPLUS, then you +will get support of Intel Cilk Plus features in VxWorks. # -# 2. USING: +# 4. USING: # The Intel(R) C++ Compiler will automatically try to bring in the Intel Cilk Plus runtime in any program that uses the relevant -features. GCC requires explicit linking of both the library and -its dependencies (libpthread, libdl). For example: +features. GCC and Clang requires an explicit compiler option, +-fcilkplus, to enable Intel Cilk Plus language extensions. +For example, + +% gcc -fcilkplus -o foo.exe foo.c +% clang -fcilkplus -o foo.exe foo.c + +Older GCC versions (e.g., 4.8 cilkplus branch) requires explicit linking +of both the library and its dependencies (libpthread, libdl). +For example: % gcc foo.c -lcilkrts -lpthread -ldl # -# 3. DOXYGEN DOCUMENTATION: +# 5. DOXYGEN DOCUMENTATION: # The library source has Doxygen markup. Generate HTML documentation @@ -64,7 +131,7 @@ based on the markup by changing directory into runtime and running: % doxygen doxygen.cfg # -# 4. QUESTIONS OR BUGS: +# 6. QUESTIONS OR BUGS: # Issues with the Intel Cilk Plus runtime can be addressed in the Intel @@ -72,7 +139,7 @@ Cilk Plus forums: http://software.intel.com/en-us/forums/intel-cilk-plus/ # -# 5. CONTRIBUTIONS: +# 7. CONTRIBUTIONS: # The Intel Cilk Plus runtime library is dual licensed. The upstream copy @@ -85,6 +152,13 @@ contributed to the upstream version via http://cilkplus.org/. Thanks to Tobias Burnus for showing us the magic to make gcc and g++ automatically include the Cilk Plus runtime. +Thanks to Eric Olson for sharing his patch for Raspberry Pi* with us. + +Thanks to Rainer Orth for submitting patches for exception handling and +enabling Cilk Plus on the SPARC* architecture. + ------------------------ Intel and Cilk are trademarks of Intel Corporation in the U.S. and/or other countries. + +*Other names and brands may be claimed as the property of others. diff --git a/libcilkrts/configure b/libcilkrts/configure index c04df2b..ecf88ee 100644 --- a/libcilkrts/configure +++ b/libcilkrts/configure @@ -5676,6 +5676,50 @@ _ACEOF fi +# Check for dl functions +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for dladdr in -ldl" >&5 +$as_echo_n "checking for dladdr in -ldl... " >&6; } +if test "${ac_cv_lib_dl_dladdr+set}" = set; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-ldl $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char dladdr (); +int +main () +{ +return dladdr (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_dl_dladdr=yes +else + ac_cv_lib_dl_dladdr=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_dl_dladdr" >&5 +$as_echo "$ac_cv_lib_dl_dladdr" >&6; } +if test "x$ac_cv_lib_dl_dladdr" = x""yes; then : + +$as_echo "#define HAVE_DLADDR 1" >>confdefs.h + +fi + + # Check whether the target supports protected visibility. save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS -Werror" @@ -5766,11 +5810,7 @@ esac # contains information on what's needed case "${target}" in - x86_64-*-*) - config_dir="x86" - ;; - - i?86-*-*) + i?86-*-* | x86_64-*-*) config_dir="x86" ;; @@ -5778,6 +5818,10 @@ case "${target}" in config_dir="arm" ;; + sparc*-*-*) + config_dir="sparc" + ;; + *) config_dir="generic" ;; @@ -11893,7 +11937,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 11896 "configure" +#line 11940 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -11999,7 +12043,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 12002 "configure" +#line 12046 "configure" #include "confdefs.h" #if HAVE_DLFCN_H diff --git a/libcilkrts/configure.ac b/libcilkrts/configure.ac index 39efeb4..8270e0b 100644 --- a/libcilkrts/configure.ac +++ b/libcilkrts/configure.ac @@ -69,6 +69,10 @@ AC_PROG_CXX AC_CONFIG_FILES([Makefile libcilkrts.spec]) AC_FUNC_ALLOCA +# Check for dl functions +AC_CHECK_LIB(dl, dladdr, + [AC_DEFINE(HAVE_DLADDR, 1, [Define if you have dladdr()])]) + # Check whether the target supports protected visibility. save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS -Werror" @@ -141,11 +145,7 @@ esac # contains information on what's needed case "${target}" in - x86_64-*-*) - config_dir="x86" - ;; - - i?86-*-*) + i?86-*-* | x86_64-*-*) config_dir="x86" ;; @@ -153,6 +153,10 @@ case "${target}" in config_dir="arm" ;; + sparc*-*-*) + config_dir="sparc" + ;; + *) config_dir="generic" ;; diff --git a/libcilkrts/configure.tgt b/libcilkrts/configure.tgt index 7f0befc..71f69b0 100644 --- a/libcilkrts/configure.tgt +++ b/libcilkrts/configure.tgt @@ -44,12 +44,12 @@ # Disable Cilk Runtime library for unsupported architectures. case "${target}" in - x86_64-*-*) - ;; - i?86-*-*) + i?86-*-* | x86_64-*-*) ;; arm-*-*) ;; + sparc*-*-*) + ;; *-*-*) UNSUPPORTED=1 ;; diff --git a/libcilkrts/include/cilk/cilk_undocumented.h b/libcilkrts/include/cilk/cilk_undocumented.h index 5f4a8c5..71a51ec 100644 --- a/libcilkrts/include/cilk/cilk_undocumented.h +++ b/libcilkrts/include/cilk/cilk_undocumented.h @@ -106,9 +106,6 @@ size_t __cilkrts_get_stack_size(void); CILK_EXPORT __CILKRTS_NOTHROW void __cilkrts_dump_stats(void); -CILK_EXPORT __CILKRTS_NOTHROW -int __cilkrts_irml_version(void); - struct __cilk_tbb_unwatch_thunk; struct __cilk_tbb_stack_op_thunk; diff --git a/libcilkrts/include/internal/cilk_version.h b/libcilkrts/include/internal/cilk_version.h index 95e1f2e..c997b02 100644 --- a/libcilkrts/include/internal/cilk_version.h +++ b/libcilkrts/include/internal/cilk_version.h @@ -49,10 +49,10 @@ #define VERSION_MAJOR 2 #define VERSION_MINOR 0 -#define VERSION_BUILD 4420 +#define VERSION_BUILD 4467 #define VERSION_REV 0 -#define VERSION_STRING "2,0,4420,0" -#define VERSION_HASH "3b2d6aa9059c" +#define VERSION_STRING "2,0,4467,0" +#define VERSION_HASH "b7e54d87bd17" #define VERSION_BRANCH "eng" -#define TBB_REV_NUMBER "14788" -#define VERSION_YEAR "2015" +#define TBB_REV_NUMBER "" +#define VERSION_YEAR "2016" diff --git a/libcilkrts/include/internal/rev.mk b/libcilkrts/include/internal/rev.mk index 96ffdc4..cd78865 100644 --- a/libcilkrts/include/internal/rev.mk +++ b/libcilkrts/include/internal/rev.mk @@ -49,4 +49,4 @@ # # It was automatically generated by cilkrts/include/internal/Makefile -CILK_REVISION = 4420 +CILK_REVISION = 4467 diff --git a/libcilkrts/runtime/cilk-abi.c b/libcilkrts/runtime/cilk-abi.c index 35bb413..8487873 100644 --- a/libcilkrts/runtime/cilk-abi.c +++ b/libcilkrts/runtime/cilk-abi.c @@ -709,7 +709,6 @@ __cilkrts_watch_stack(__cilk_tbb_unwatch_thunk *u, return 0; /* Success! */ } - // This function must be called only within a continuation, within the stack // frame of the continuation itself. CILK_API_INT __cilkrts_synched(void) diff --git a/libcilkrts/runtime/cilk_fiber-unix.cpp b/libcilkrts/runtime/cilk_fiber-unix.cpp index d59bfca..c38c49f 100644 --- a/libcilkrts/runtime/cilk_fiber-unix.cpp +++ b/libcilkrts/runtime/cilk_fiber-unix.cpp @@ -220,7 +220,7 @@ NORETURN cilk_fiber_sysdep::run() // enough extra space from the top of the stack we are // switching to for any temporaries required for this run() // function. - JMPBUF_SP(m_resume_jmpbuf) = m_stack_base - frame_size; + JMPBUF_SP(m_resume_jmpbuf) = CILK_ADJUST_SP(m_stack_base - frame_size); // GCC doesn't allow us to call __builtin_longjmp in the same function // that calls __builtin_setjmp, so it's been moved into it's own diff --git a/libcilkrts/runtime/cilk_fiber.h b/libcilkrts/runtime/cilk_fiber.h index d91687a..43057f2 100644 --- a/libcilkrts/runtime/cilk_fiber.h +++ b/libcilkrts/runtime/cilk_fiber.h @@ -73,9 +73,12 @@ * * A value of 0 means no debugging. * Higher values generate more debugging output. + * */ -#define FIBER_DEBUG 0 +#ifndef FIBER_DEBUG +#define FIBER_DEBUG 0 +#endif /** * @brief Flag for validating reference counts. * diff --git a/libcilkrts/runtime/config/arm/os-fence.h b/libcilkrts/runtime/config/arm/os-fence.h index 67e157a..779a2dc 100644 --- a/libcilkrts/runtime/config/arm/os-fence.h +++ b/libcilkrts/runtime/config/arm/os-fence.h @@ -47,6 +47,14 @@ * for your assistance in helping us improve Cilk Plus. **************************************************************************/ +// __atomic_* intrinsics are available since GCC 4.7. +#define HAVE_ATOMIC_INTRINSICS defined(__GNUC__) && \ + (__GNUC__ * 10 + __GNUC_MINOR__ >= 47) + +// GCC before 4.4 does not implement __sync_synchronize properly +#define HAVE_SYNC_INTRINSICS defined(__GNUC__) && \ + (__GNUC__ * 10 + __GNUC_MINOR__ >= 44) + /* * void __cilkrts_fence(void) * @@ -60,5 +68,12 @@ * the CPUID instruction). */ -// COMMON_SYSDEP void __cilkrts_fence(void); ///< MFENCE instruction -# define __cilkrts_fence() __asm__ __volatile__ ("mcr p15,0,%[t],c7,c10,4\n" :: [t] "r" (0) : "memory"); +#if HAVE_ATOMIC_INTRINSICS +# define __cilkrts_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST) +#elif HAVE_SYNC_INTRINSICS +# define __cilkrts_fence() __sync_synchronize() +#else +# define __cilkrts_fence() +// Leaving this code just in case. +//# define __cilkrts_fence() __asm__ __volatile__ ("mcr p15,0,%[t],c7,c10,4\n" :: [t] "r" (0) : "memory"); +#endif diff --git a/libcilkrts/runtime/config/sparc/cilk-abi-vla.c b/libcilkrts/runtime/config/sparc/cilk-abi-vla.c new file mode 100644 index 0000000..cf88d99 --- /dev/null +++ b/libcilkrts/runtime/config/sparc/cilk-abi-vla.c @@ -0,0 +1,115 @@ +/* cilk-abi-vla.cpp -*-C++-*- + * + ************************************************************************* + * + * Copyright (C) 2013-2016, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * ********************************************************************* + * + * PLEASE NOTE: This file is a downstream copy of a file mainitained in + * a repository at cilkplus.org. Changes made to this file that are not + * submitted through the contribution process detailed at + * http://www.cilkplus.org/submit-cilk-contribution will be lost the next + * time that a new version is released. Changes only submitted to the + * GNU compiler collection or posted to the git repository at + * https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are + * not tracked. + * + * We welcome your contributions to this open source project. Thank you + * for your assistance in helping us improve Cilk Plus. + * + **************************************************************************/ + +/* + * Implementation of Variable Length Array (VLA) ABI. + * + * The compiler calls these functions to allocate Variable Length Arrays + * at runtime. The compiler must guarantee that __cilkrts_stack_free() is + * called to cleanup any memory allocated by __cilkrts_stack_alloc(). + * + * This generic implementation always allocates the memory from the heap. + * Optimally, the implementation should expand the frame of the calling + * function if possible, since that will be faster. See the x86 version + * for one possible implementation. + */ + +#include +#include +#include + +#include "internal/abi.h" +#include "cilk-abi-vla-internal.h" + +// Allocate space for a variable length array +CILK_ABI(__cilkrts_void_ptr) +__cilkrts_stack_alloc( + __cilkrts_stack_frame *sf, + size_t size, + size_t distance_from_sp_to_alloca_area, + uint32_t align, // align is always >= minimum stack alignment and + // >= ptr_size as well, and must be a power of 2. + uint32_t needs_tag // non-zero if the pointer being returned needs to + // be tagged +) +{ + // full_size will be a multiple of align, and contains + // enough extra space to allocate a marker. + size_t full_size = (size + align - 1) & ~(align - 1); + + // Allocate memory from the heap. The compiler is responsible + // for guaranteeing us a chance to free it before the function + // exits + + return (void *)vla_internal_heap_alloc(sf, full_size, align); +} + +// Free the space allocated for a variable length array. +CILK_ABI(void) +__cilkrts_stack_free( + __cilkrts_stack_frame *sf, + void *p, + size_t size, + size_t distance_from_sp_to_alloca_area, + uint32_t align, // same requirements as for align in allocation, + // and must match alignment that was passed when + // doing the allocation + uint32_t known_from_stack // non-zero if this is known to be allocated + // on the stack, and therefore has no tag +) +{ + // full_size will be a multiple of align, and contains + // enough extra space to allocate a marker if one was needed. + size_t full_size = (size + align - 1) & ~(align - 1); + + // Just free the allocated memory to the heap since we don't know + // how to expand/contract the calling frame + vla_internal_heap_free(p, full_size); +} diff --git a/libcilkrts/runtime/config/sparc/os-fence.h b/libcilkrts/runtime/config/sparc/os-fence.h new file mode 100644 index 0000000..24e7993 --- /dev/null +++ b/libcilkrts/runtime/config/sparc/os-fence.h @@ -0,0 +1,64 @@ +/* os.h -*-C++-*- + * + ************************************************************************* + * + * Copyright (C) 2009-2016, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * ********************************************************************* + * + * PLEASE NOTE: This file is a downstream copy of a file mainitained in + * a repository at cilkplus.org. Changes made to this file that are not + * submitted through the contribution process detailed at + * http://www.cilkplus.org/submit-cilk-contribution will be lost the next + * time that a new version is released. Changes only submitted to the + * GNU compiler collection or posted to the git repository at + * https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are + * not tracked. + * + * We welcome your contributions to this open source project. Thank you + * for your assistance in helping us improve Cilk Plus. + **************************************************************************/ + +// GCC before 4.4 does not implement __sync_synchronize properly +#define HAVE_SYNC_INTRINSICS defined(__GNUC__) && \ + (__GNUC__ * 10 + __GNUC_MINOR__ >= 44) + +/* + * void __cilkrts_fence(void) + */ + +#if HAVE_SYNC_INTRINSICS +# define __cilkrts_fence() __sync_synchronize() +#elif defined(__GNUC__) +# define __cilkrts_fence() __asm__ volatile ("membar #StoreLoad" ::: "memory") +#else +COMMON_SYSDEP void __cilkrts_fence(void); +#endif diff --git a/libcilkrts/runtime/config/sparc/os-unix-sysdep.c b/libcilkrts/runtime/config/sparc/os-unix-sysdep.c new file mode 100644 index 0000000..997e9f6 --- /dev/null +++ b/libcilkrts/runtime/config/sparc/os-unix-sysdep.c @@ -0,0 +1,115 @@ +/* os-unix-sysdep.c -*-C-*- + * + ************************************************************************* + * + * Copyright (C) 2009-2016, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * ********************************************************************* + * + * PLEASE NOTE: This file is a downstream copy of a file mainitained in + * a repository at cilkplus.org. Changes made to this file that are not + * submitted through the contribution process detailed at + * http://www.cilkplus.org/submit-cilk-contribution will be lost the next + * time that a new version is released. Changes only submitted to the + * GNU compiler collection or posted to the git repository at + * https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are + * not tracked. + * + * We welcome your contributions to this open source project. Thank you + * for your assistance in helping us improve Cilk Plus. + ************************************************************************* + * + * This file contains system-specific code for sparc-based systems + */ + +#include "os.h" +#include "sysdep.h" + +/* + * The cycle counter is used for debugging. This function is only called if + * CILK_PROFILE is defined when the runtime is built. + */ +COMMON_SYSDEP unsigned long long __cilkrts_getticks(void) +{ + unsigned long long tick; +#ifdef __sparcv9 + __asm__ volatile("rd %%tick, %0" : "=r"(tick)); +#else + __asm__ volatile("rd %%tick, %L0\n" + "srlx %L0, 32, %H0" + : "=r"(tick)); +#endif + return tick; +} + +/* + * A "short pause" - called from the Cilk runtime's spinloops. + */ +COMMON_SYSDEP void __cilkrts_short_pause(void) +{ + /* Spin around for 8 cycles. */ + __asm__ volatile("rd %ccr, %g0"); + __asm__ volatile("rd %ccr, %g0"); + __asm__ volatile("rd %ccr, %g0"); + __asm__ volatile("rd %ccr, %g0"); +} + +/* + * Interlocked exchange - used to implement the Cilk runtime's spinloops + */ +COMMON_SYSDEP int __cilkrts_xchg(volatile int *ptr, int x) +{ + x = __sync_lock_test_and_set(ptr, x); + return x; +} + + +/* + * Restore the floating point state that is stored in a stack frame at each + * spawn. This should be called each time a frame is resumed. + * + * Only valid for IA32 and Intel64 processors. + */ +void restore_x86_fp_state (__cilkrts_stack_frame *sf) +{ +} + + +/* + * Save the floating point state to the __cilkrts_stack_frame at each spawn. + * + * Architecture-specific - Should only be needed on IA32 and Intel64 + * processors. + */ +void sysdep_save_fp_ctrl_state(__cilkrts_stack_frame *sf) +{ +} + diff --git a/libcilkrts/runtime/except-gcc.cpp b/libcilkrts/runtime/except-gcc.cpp index 4940acb..7fc6e6b 100644 --- a/libcilkrts/runtime/except-gcc.cpp +++ b/libcilkrts/runtime/except-gcc.cpp @@ -60,7 +60,9 @@ #include #include +#ifndef DEBUG_EXCEPTIONS #define DEBUG_EXCEPTIONS 0 +#endif struct pending_exception_info { diff --git a/libcilkrts/runtime/global_state.cpp b/libcilkrts/runtime/global_state.cpp index 2af6697..6c77b5f 100644 --- a/libcilkrts/runtime/global_state.cpp +++ b/libcilkrts/runtime/global_state.cpp @@ -579,7 +579,6 @@ global_state_t* cilkg_init_global_state() g->stack_size = cilkos_validate_stack_size(g->stack_size); g->failure_to_allocate_stack = 0; - return g; } diff --git a/libcilkrts/runtime/global_state.h b/libcilkrts/runtime/global_state.h index 527a4b5..f5937b8 100644 --- a/libcilkrts/runtime/global_state.h +++ b/libcilkrts/runtime/global_state.h @@ -215,7 +215,6 @@ struct global_state_t { /* COMMON_PORTABLE */ /// Global fiber pool cilk_fiber_pool fiber_pool; - /** * @brief Track whether the runtime has failed to allocate a * stack. diff --git a/libcilkrts/runtime/jmpbuf.h b/libcilkrts/runtime/jmpbuf.h index 0ce7ff8..9ae2fd2 100644 --- a/libcilkrts/runtime/jmpbuf.h +++ b/libcilkrts/runtime/jmpbuf.h @@ -107,6 +107,18 @@ */ #define SP(SF) JMPBUF_SP((SF)->ctx) +/** + * @brief Some architecture-dependent stack adjustment. + */ +#if defined(__sparcv9) + // Subtract sparc v9 stack bias so the actual stack starts at the + // allocated area. +# define CILK_ADJUST_SP(SP) ((SP) - 2047) +# define CILK_UNADJUST_SP(SP) ((SP) + 2047) +#else +# define CILK_ADJUST_SP(SP) (SP) +# define CILK_UNADJUST_SP(SP) (SP) +#endif __CILKRTS_BEGIN_EXTERN_C @@ -120,7 +132,7 @@ __CILKRTS_BEGIN_EXTERN_C */ inline char *__cilkrts_get_sp(__cilkrts_stack_frame *sf) { - return (char *)SP(sf); + return (char *)CILK_UNADJUST_SP(SP(sf)); } /** diff --git a/libcilkrts/runtime/linux-symbols.ver b/libcilkrts/runtime/linux-symbols.ver index d656842..57b4e8a 100644 --- a/libcilkrts/runtime/linux-symbols.ver +++ b/libcilkrts/runtime/linux-symbols.ver @@ -71,7 +71,6 @@ CILKABI0 __cilkrts_hyperobject_dealloc; __cilkrts_hyperobject_noop_destroy; __cilkrts_init; - __cilkrts_irml_version; __cilkrts_leave_frame; __cilkrts_metacall; __cilkrts_rethrow; diff --git a/libcilkrts/runtime/local_state.h b/libcilkrts/runtime/local_state.h index d16599f..9b28685 100644 --- a/libcilkrts/runtime/local_state.h +++ b/libcilkrts/runtime/local_state.h @@ -360,6 +360,13 @@ struct local_state /* COMMON_PORTABLE */ unsigned int steal_failure_count; /** + * Record indicating that the worker stole work at least once. + * + * [local read/write] + */ + int has_stolen; + + /** * 1 if work was stolen from another worker. When true, this will flag * setup_for_execution_pedigree to increment the pedigree when we resume * execution to match the increment that would have been done on a return diff --git a/libcilkrts/runtime/mac-symbols.txt b/libcilkrts/runtime/mac-symbols.txt index efec3e9..29bf481 100644 --- a/libcilkrts/runtime/mac-symbols.txt +++ b/libcilkrts/runtime/mac-symbols.txt @@ -33,7 +33,6 @@ ___cilkrts_hyperobject_alloc ___cilkrts_hyperobject_dealloc ___cilkrts_hyperobject_noop_destroy ___cilkrts_init - ___cilkrts_leave_frame ___cilkrts_metacall ___cilkrts_resume diff --git a/libcilkrts/runtime/os-unix.c b/libcilkrts/runtime/os-unix.c index d339daf..c419fb6 100644 --- a/libcilkrts/runtime/os-unix.c +++ b/libcilkrts/runtime/os-unix.c @@ -432,7 +432,9 @@ COMMON_SYSDEP void __cilkrts_idle(void) #elif defined(__MIC__) _mm_delay_32(1024); #elif defined(__linux__) || \ - defined(__APPLE__) + defined(__APPLE__) || \ + defined(__CYGWIN__) + usleep(10000); #else # error "Unsupported architecture" @@ -452,6 +454,7 @@ COMMON_SYSDEP void __cilkrts_yield(void) { #if defined(__ANDROID__) || \ defined(__APPLE__) || \ + defined(__CYGWIN__) || \ defined(__FreeBSD__) || \ defined(__VXWORKS__) || \ (defined(__sun__) && defined(__svr4__)) diff --git a/libcilkrts/runtime/record-replay.cpp b/libcilkrts/runtime/record-replay.cpp index 293c99a..d92d28f 100644 --- a/libcilkrts/runtime/record-replay.cpp +++ b/libcilkrts/runtime/record-replay.cpp @@ -262,7 +262,7 @@ char * walk_pedigree_nodes(char *p, const __cilkrts_pedigree *pnode) if (pnode->parent) { p = walk_pedigree_nodes(p, pnode->parent); - p += cilk_snprintf_s(p, PEDIGREE_BUFF_SIZE, "%s", (char *) "_"); + p += cilk_snprintf_s(p, PEDIGREE_BUFF_SIZE, "%s", (char *)"_"); } return p + cilk_snprintf_l(p, PEDIGREE_BUFF_SIZE, "%" PRIu64, pnode->rank); } diff --git a/libcilkrts/runtime/scheduler.c b/libcilkrts/runtime/scheduler.c index 538c431..82c9e02 100644 --- a/libcilkrts/runtime/scheduler.c +++ b/libcilkrts/runtime/scheduler.c @@ -1789,20 +1789,27 @@ static full_frame* check_for_work(__cilkrts_worker *w) if (NULL == ff) { // Punish the worker for failing to steal. // No quantum for you! - if (w->l->steal_failure_count > 30000) { - // Punish more if the worker has been doing unsuccessful steals - // for a long time. After return from the idle state, it will - // be given a grace period to react quickly. + unsigned int max_fails = w->g->max_steal_failures << 1; + if (w->l->has_stolen == 0 && + w->l->steal_failure_count % max_fails == max_fails - 1) { + // Idle briefly if the worker has never stolen anything for + // the given grace period __cilkrts_idle(); - w->l->steal_failure_count -= 300; } else { __cilkrts_yield(); } w->l->steal_failure_count++; + if (w->l->steal_failure_count > (max_fails << 8)) { + // Reset the flag after certain amount of failures + // - This will reduce cpu time in top-level synched regions + // - max_fails can be controlled by user (CILK_STEAL_FAILURES) + w->l->has_stolen = 0; + } } else { // Reset steal_failure_count since there is obviously still work to // be done. w->l->steal_failure_count = 0; + w->l->has_stolen = 1; } } return ff; @@ -2912,6 +2919,7 @@ __cilkrts_worker *make_worker(global_state_t *g, w->l->stats = NULL; #endif w->l->steal_failure_count = 0; + w->l->has_stolen = 0; w->l->work_stolen = 0; diff --git a/libcilkrts/runtime/scheduler.h b/libcilkrts/runtime/scheduler.h index 74c4509..9546001 100644 --- a/libcilkrts/runtime/scheduler.h +++ b/libcilkrts/runtime/scheduler.h @@ -85,7 +85,10 @@ __CILKRTS_BEGIN_EXTERN_C * Print debugging messages and assertions for parallel reducers. 0 is * no debugging. A higher value generates more output. */ + +#ifndef REDPAR_DEBUG #define REDPAR_DEBUG 0 +#endif /** * @brief Lock the worker mutex to allow exclusive access to the diff --git a/libcilkrts/runtime/sysdep-unix.c b/libcilkrts/runtime/sysdep-unix.c index 611934a..0b99008 100644 --- a/libcilkrts/runtime/sysdep-unix.c +++ b/libcilkrts/runtime/sysdep-unix.c @@ -465,7 +465,7 @@ char* sysdep_reset_jump_buffers_for_resume(cilk_fiber* fiber, CILK_ASSERT(fiber); void* sp = (void*)get_sp_for_executing_sf(cilk_fiber_get_stack_base(fiber), ff, sf); - SP(sf) = sp; + SP(sf) = CILK_ADJUST_SP(sp); /* Debugging: make sure stack is accessible. */ ((volatile char *)sp)[-1]; @@ -495,7 +495,7 @@ NORETURN sysdep_longjmp_to_sf(char* new_sp, #endif // Set the stack pointer. - SP(sf) = new_sp; + SP(sf) = CILK_ADJUST_SP(new_sp); #ifdef RESTORE_X86_FP_STATE if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1) { @@ -568,7 +568,7 @@ static const char *get_runtime_path () { // dladdr is a glibc extension. If it's available, use it to find the path // for libcilkrts.so -#ifdef _GNU_SOURCE +#if HAVE_DLADDR Dl_info info; if (0 != dladdr(dummy_function, &info)) return info.dli_fname; @@ -689,7 +689,6 @@ static void write_version_file (global_state_t *g, int n) // ================== // System cores: 8 // Cilk workers requested: 8 - // Thread creator: Private fprintf(fp, "\nThread information\n"); fprintf(fp, "==================\n"); @@ -699,11 +698,6 @@ static void write_version_file (global_state_t *g, int n) fprintf(fp, "System cores: %d\n", (int)sysconf(_SC_NPROCESSORS_ONLN)); #endif fprintf(fp, "Cilk workers requested: %d\n", n); -#if (PARALLEL_THREAD_CREATE) - fprintf(fp, "Thread creator: Private (parallel)\n"); -#else - fprintf(fp, "Thread creator: Private\n"); -#endif if (fp != stderr && fp != stdout) fclose(fp); else fflush(fp); // flush the handle buffer if it is stdout or stderr. -- cgit v1.1 From 56e1a4d7127256bb3f476a6d93954b1948b03985 Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Thu, 1 Dec 2016 14:59:03 +0000 Subject: Fix PR tree-optimization/78598 - tree-ssa-loop-prefetch.c:835:16: runtime error: signed integer overflow Using bootstrap-ubsan gcc to build mplayer shows: tree-ssa-loop-prefetch.c:835:16: runtime error: signed integer overflow: 288230376151711743 * 64 cannot be represented in type 'long int' Here signed und unsigned integers are mixed in a division resulting in bogus values: (-83 + 64ULL -1) / 64ULL) == 288230376151711743 Fixed by casting the unsigned parameter to signed. PR tree-optimization/78598 * tree-ssa-loop-prefetch.c (ddown): Cast to signed to avoid overflows. From-SVN: r243113 --- gcc/ChangeLog | 6 ++++++ gcc/tree-ssa-loop-prefetch.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b90cbc6..17e9831 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,11 @@ 2016-12-01 Markus Trippelsdorf + PR tree-optimization/78598 + * tree-ssa-loop-prefetch.c (ddown): Cast to signed to avoid + overflows. + +2016-12-01 Markus Trippelsdorf + PR rtl-optimization/78596 * combine.c (simplify_comparison): Cast to unsigned to avoid left shifting of negative value. diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c index 0a2ee5e..ead2543 100644 --- a/gcc/tree-ssa-loop-prefetch.c +++ b/gcc/tree-ssa-loop-prefetch.c @@ -700,9 +700,9 @@ ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by) gcc_assert (by > 0); if (x >= 0) - return x / by; + return x / (HOST_WIDE_INT) by; else - return (x + by - 1) / by; + return (x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by; } /* Given a CACHE_LINE_SIZE and two inductive memory references -- cgit v1.1 From d95fe8017d40a6f0df671fd3e6a5fdc5b1d5319d Mon Sep 17 00:00:00 2001 From: James Greenhalgh Date: Thu, 1 Dec 2016 15:33:29 +0000 Subject: [Patch testsuite obvious] Use setjmp, not sigsetjmp in gcc.dg/pr78582.c gcc/testsuite/ * gcc.dg/pr78582.c (main): Call setjmp, not sigsetjmp. From-SVN: r243116 --- gcc/testsuite/ChangeLog | 4 ++++ gcc/testsuite/gcc.dg/pr78582.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 447d9fb..2a04091 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2016-12-01 James Greenhalgh + + * gcc.dg/pr78582.c (main): Call setjmp, not sigsetjmp. + 2016-12-01 Richard Biener * gcc.dg/torture/alias-2.c: New testcase. diff --git a/gcc/testsuite/gcc.dg/pr78582.c b/gcc/testsuite/gcc.dg/pr78582.c index 3084e3b..5284e3f 100644 --- a/gcc/testsuite/gcc.dg/pr78582.c +++ b/gcc/testsuite/gcc.dg/pr78582.c @@ -10,7 +10,7 @@ int main (int argc, char argv, char env) { int a; - sigsetjmp (0, 0); + setjmp (0); argc = a = argc; reader_loop (); -- cgit v1.1 From a9c21e2a4f44175eab8588d794a3ea3ce2fa8d0b Mon Sep 17 00:00:00 2001 From: Georg-Johann Lay Date: Thu, 1 Dec 2016 15:56:58 +0000 Subject: avr.c: Fix coding rule glitches. * config/avr/avr.c: Fix coding rule glitches. From-SVN: r243118 --- gcc/ChangeLog | 4 ++ gcc/config/avr/avr.c | 152 +++++++++++++++++++++++++-------------------------- 2 files changed, 80 insertions(+), 76 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 17e9831..5b2570b 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,7 @@ +2016-12-01 Georg-Johann Lay + + * config/avr/avr.c: Fix coding rule glitches. + 2016-12-01 Markus Trippelsdorf PR tree-optimization/78598 diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c index db3c55f..b7fd8798 100644 --- a/gcc/config/avr/avr.c +++ b/gcc/config/avr/avr.c @@ -388,7 +388,7 @@ avr_parallel_insn_from_insns (rtx_insn *i[6]) If this is the case, fill in the insns from casesi to INSNS[1..5] and the SImode extension to INSNS[0]. Moreover, extract the operands of pattern casesi__sequence forged from the sequence to recog_data. */ - + static bool avr_is_casesi_sequence (basic_block bb, rtx_insn *insn, rtx_insn *insns[6]) { @@ -702,7 +702,7 @@ avr_set_core_architecture (void) break; } else if (0 == strcmp (mcu->name, avr_mmcu) - // Is this a proper architecture ? + // Is this a proper architecture ? && NULL == mcu->macro) { avr_arch = &avr_arch_types[mcu->arch_id]; @@ -1078,7 +1078,7 @@ avr_set_current_function (tree decl) if (!STR_PREFIX_P (name, "__vector")) warning_at (loc, OPT_Wmisspelled_isr, "%qs appears to be a misspelled " - "%s handler, missing __vector prefix", name, isr); + "%s handler, missing __vector prefix", name, isr); } /* Don't print the above diagnostics more than once. */ @@ -1163,7 +1163,7 @@ avr_regs_to_save (HARD_REG_SET *set) /* Don't record frame pointer registers here. They are treated indivitually in prologue. */ && !(frame_pointer_needed - && (reg == REG_Y || reg == (REG_Y+1))))) + && (reg == REG_Y || reg == REG_Y + 1)))) { if (set) SET_HARD_REG_BIT (*set, reg); @@ -1374,7 +1374,7 @@ sequent_regs_live (void) else cur_seq = 0; - if (df_regs_ever_live_p (REG_Y+1)) + if (df_regs_ever_live_p (REG_Y + 1)) { ++live_seq; ++cur_seq; @@ -1807,7 +1807,8 @@ avr_expand_prologue (void) avr_prologue_setup_frame (size, set); if (flag_stack_usage_info) - current_function_static_stack_size = cfun->machine->stack_usage + INCOMING_FRAME_SP_OFFSET; + current_function_static_stack_size + = cfun->machine->stack_usage + INCOMING_FRAME_SP_OFFSET; } @@ -1840,9 +1841,9 @@ avr_asm_function_end_prologue (FILE *file) avr_outgoing_args_size()); fprintf (file, "/* frame size = " HOST_WIDE_INT_PRINT_DEC " */\n", - get_frame_size()); + get_frame_size()); fprintf (file, "/* stack size = %d */\n", - cfun->machine->stack_usage); + cfun->machine->stack_usage); /* Create symbol stack offset here so all functions have it. Add 1 to stack usage for offset so that SP + .L__stack_offset = return address. */ fprintf (file, ".L__stack_usage = %d\n", cfun->machine->stack_usage); @@ -2522,7 +2523,7 @@ avr_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr) rtx x = addr; if (GET_CODE (x) == CONST) x = XEXP (x, 0); - if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x,1))) + if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1))) { /* Assembler gs() will implant word address. Make offset a byte offset inside gs() for assembler. This is @@ -2532,14 +2533,14 @@ avr_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr) from symbol which may not be what the user really wanted. */ fprintf (file, "gs("); - output_addr_const (file, XEXP (x,0)); + output_addr_const (file, XEXP (x, 0)); fprintf (file, "+" HOST_WIDE_INT_PRINT_DEC ")", 2 * INTVAL (XEXP (x, 1))); if (AVR_3_BYTE_PC) if (warning (0, "pointer offset from symbol maybe incorrect")) { output_addr_const (stderr, addr); - fprintf(stderr,"\n"); + fprintf (stderr, "\n"); } } else @@ -2617,12 +2618,12 @@ avr_print_operand (FILE *file, rtx x, int code) } else if (code == 'E' || code == 'F') { - rtx op = XEXP(x, 0); + rtx op = XEXP (x, 0); fprintf (file, "%s", reg_names[REGNO (op) + ef]); } else if (code == 'I' || code == 'J') { - rtx op = XEXP(XEXP(x, 0), 0); + rtx op = XEXP (XEXP (x, 0), 0); fprintf (file, "%s", reg_names[REGNO (op) + ij]); } else if (REG_P (x)) @@ -2714,12 +2715,12 @@ avr_print_operand (FILE *file, rtx x, int code) } else if (GET_CODE (addr) == PLUS) { - avr_print_operand_address (file, VOIDmode, XEXP (addr,0)); + avr_print_operand_address (file, VOIDmode, XEXP (addr, 0)); if (REGNO (XEXP (addr, 0)) == REG_X) fatal_insn ("internal compiler error. Bad address:" ,addr); fputc ('+', file); - avr_print_operand (file, XEXP (addr,1), code); + avr_print_operand (file, XEXP (addr, 1), code); } else avr_print_operand_address (file, VOIDmode, addr); @@ -2753,7 +2754,7 @@ avr_print_operand (FILE *file, rtx x, int code) code); fprintf (file, HOST_WIDE_INT_PRINT_DEC, ival); } - else if (GET_CODE (x) == CONST_DOUBLE) + else if (CONST_DOUBLE_P (x)) { long val; if (GET_MODE (x) != SFmode) @@ -2781,15 +2782,15 @@ avr_print_operand (FILE *file, rtx x, int code) static bool avr_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, - unsigned int align ATTRIBUTE_UNUSED, - enum by_pieces_operation op, - bool speed_p) + unsigned int align ATTRIBUTE_UNUSED, + enum by_pieces_operation op, + bool speed_p) { - - if (op != MOVE_BY_PIECES || (speed_p && (size > (MOVE_MAX_PIECES)))) + if (op != MOVE_BY_PIECES + || (speed_p && size > MOVE_MAX_PIECES)) return default_use_by_pieces_infrastructure_p (size, align, op, speed_p); - return size <= (MOVE_MAX_PIECES); + return size <= MOVE_MAX_PIECES; } @@ -2951,9 +2952,9 @@ avr_jump_mode (rtx x, rtx_insn *insn) int cur_addr = INSN_ADDRESSES (INSN_UID (insn)); int jump_distance = cur_addr - dest_addr; - if (-63 <= jump_distance && jump_distance <= 62) + if (IN_RANGE (jump_distance, -63, 62)) return 1; - else if (-2046 <= jump_distance && jump_distance <= 2045) + else if (IN_RANGE (jump_distance, -2046, 2045)) return 2; else if (AVR_HAVE_JMP_CALL) return 3; @@ -3113,9 +3114,9 @@ avr_simplify_comparison_p (machine_mode mode, RTX_CODE op, rtx x) register in which function arguments are sometimes passed. */ int -avr_function_arg_regno_p(int r) +avr_function_arg_regno_p (int r) { - return (AVR_TINY ? r >= 20 && r <= 25 : r >= 8 && r <= 25); + return AVR_TINY ? IN_RANGE (r, 20, 25) : IN_RANGE (r, 8, 25); } @@ -3526,8 +3527,8 @@ avr_out_lpm (rtx_insn *insn, rtx *op, int *plen) } else { - avr_asm_len ("mov %5,%2" CR_TAB - "ldi %2,%4" CR_TAB + avr_asm_len ("mov %5,%2" CR_TAB + "ldi %2,%4" CR_TAB "out %i6,%2" CR_TAB "mov %2,%5", xop, plen, 4); } @@ -3595,7 +3596,7 @@ avr_out_lpm (rtx_insn *insn, rtx *op, int *plen) if (REGNO (dest) == REG_Z - 2) return avr_asm_len ("%4lpm %5,%a2+" CR_TAB - "%4lpm %C0,%a2" CR_TAB + "%4lpm %C0,%a2" CR_TAB "mov %D0,%5", xop, plen, 3); else { @@ -3801,8 +3802,8 @@ avr_out_movqi_r_mr_reg_disp_tiny (rtx_insn *insn, rtx op[], int *plen) avr_asm_len (TINY_ADIW (%I1, %J1, %o1) CR_TAB "ld %0,%b1" , op, plen, -3); - if (!reg_overlap_mentioned_p (dest, XEXP (x,0)) - && !reg_unused_after (insn, XEXP (x,0))) + if (!reg_overlap_mentioned_p (dest, XEXP (x, 0)) + && !reg_unused_after (insn, XEXP (x, 0))) avr_asm_len (TINY_SBIW (%I1, %J1, %o1), op, plen, 2); return ""; @@ -3858,8 +3859,8 @@ out_movqi_r_mr (rtx_insn *insn, rtx op[], int *plen) avr_asm_len ("adiw r26,%o1" CR_TAB "ld %0,X", op, plen, -2); - if (!reg_overlap_mentioned_p (dest, XEXP (x,0)) - && !reg_unused_after (insn, XEXP (x,0))) + if (!reg_overlap_mentioned_p (dest, XEXP (x, 0)) + && !reg_unused_after (insn, XEXP (x, 0))) { avr_asm_len ("sbiw r26,%o1", op, plen, 1); } @@ -3891,7 +3892,7 @@ avr_out_movhi_r_mr_reg_no_disp_tiny (rtx_insn *insn, rtx op[], int *plen) "ld %B0,%1" CR_TAB "mov %A0,__tmp_reg__", op, plen, -3); - avr_asm_len ("ld %A0,%1+" CR_TAB + avr_asm_len ("ld %A0,%1+" CR_TAB "ld %B0,%1", op, plen, -2); if (!reg_unused_after (insn, base)) @@ -4228,12 +4229,12 @@ out_movsi_r_mr (rtx_insn *insn, rtx op[], int *l) "ld %D0,X" CR_TAB "mov %C0,__tmp_reg__"); else if (reg_unused_after (insn, base)) - return *l=4, ("ld %A0,X+" CR_TAB + return *l=4, ("ld %A0,X+" CR_TAB "ld %B0,X+" CR_TAB "ld %C0,X+" CR_TAB "ld %D0,X"); else - return *l=5, ("ld %A0,X+" CR_TAB + return *l=5, ("ld %A0,X+" CR_TAB "ld %B0,X+" CR_TAB "ld %C0,X+" CR_TAB "ld %D0,X" CR_TAB @@ -4873,7 +4874,7 @@ avr_out_load_psi (rtx_insn *insn, rtx *op, int *plen) return avr_asm_len ("subi r28,lo8(-%o1)" CR_TAB "sbci r29,hi8(-%o1)" CR_TAB - "ld %A0,Y" CR_TAB + "ld %A0,Y" CR_TAB "ldd %B0,Y+1" CR_TAB "ldd %C0,Y+2" CR_TAB "subi r28,lo8(%o1)" CR_TAB @@ -5196,7 +5197,7 @@ avr_out_movqi_mr_r_reg_disp_tiny (rtx_insn *insn, rtx op[], int *plen) "st %b0,%1", op, plen, -3); } - if (!reg_unused_after (insn, XEXP (x,0))) + if (!reg_unused_after (insn, XEXP (x, 0))) avr_asm_len (TINY_SBIW (%I0, %J0, %o0), op, plen, 2); return ""; @@ -5243,7 +5244,7 @@ out_movqi_mr_r (rtx_insn *insn, rtx op[], int *plen) "subi r28,lo8(%o0)" CR_TAB "sbci r29,hi8(%o0)", op, plen, -5); } - else if (REGNO (XEXP (x,0)) == REG_X) + else if (REGNO (XEXP (x, 0)) == REG_X) { if (reg_overlap_mentioned_p (src, XEXP (x, 0))) { @@ -5257,7 +5258,7 @@ out_movqi_mr_r (rtx_insn *insn, rtx op[], int *plen) "st X,%1", op, plen, -2); } - if (!reg_unused_after (insn, XEXP (x,0))) + if (!reg_unused_after (insn, XEXP (x, 0))) avr_asm_len ("sbiw r26,%o0", op, plen, 1); return ""; @@ -5403,7 +5404,7 @@ avr_out_movhi_mr_r_reg_no_disp_tiny (rtx_insn *insn, rtx op[], int *plen) "st %0,__tmp_reg__", op, plen, -5) : avr_asm_len ("mov __tmp_reg__,%B1" CR_TAB TINY_ADIW (%E0, %F0, 1) CR_TAB - "st %0,__tmp_reg__" CR_TAB + "st %0,__tmp_reg__" CR_TAB TINY_SBIW (%E0, %F0, 1) CR_TAB "st %0, %A1", op, plen, -7); } @@ -6200,9 +6201,9 @@ ashlhi3_out (rtx_insn *insn, rtx operands[], int *len) return ("swap %A0" CR_TAB "swap %B0" CR_TAB "ldi %3,0xf0" CR_TAB - "and %B0,%3" CR_TAB + "and %B0,%3" CR_TAB "eor %B0,%A0" CR_TAB - "and %A0,%3" CR_TAB + "and %A0,%3" CR_TAB "eor %B0,%A0"); } break; /* optimize_size ? 6 : 8 */ @@ -6230,9 +6231,9 @@ ashlhi3_out (rtx_insn *insn, rtx operands[], int *len) "swap %A0" CR_TAB "swap %B0" CR_TAB "ldi %3,0xf0" CR_TAB - "and %B0,%3" CR_TAB + "and %B0,%3" CR_TAB "eor %B0,%A0" CR_TAB - "and %A0,%3" CR_TAB + "and %A0,%3" CR_TAB "eor %B0,%A0"); } break; /* 10 */ @@ -6344,7 +6345,7 @@ ashlhi3_out (rtx_insn *insn, rtx operands[], int *len) if (AVR_HAVE_MUL) { *len = 6; - return ("set" CR_TAB + return ("set" CR_TAB "bld r1,5" CR_TAB "mul %A0,r1" CR_TAB "mov %B0,r0" CR_TAB @@ -7095,9 +7096,9 @@ lshrhi3_out (rtx_insn *insn, rtx operands[], int *len) return ("swap %B0" CR_TAB "swap %A0" CR_TAB "ldi %3,0x0f" CR_TAB - "and %A0,%3" CR_TAB + "and %A0,%3" CR_TAB "eor %A0,%B0" CR_TAB - "and %B0,%3" CR_TAB + "and %B0,%3" CR_TAB "eor %A0,%B0"); } break; /* optimize_size ? 6 : 8 */ @@ -7125,9 +7126,9 @@ lshrhi3_out (rtx_insn *insn, rtx operands[], int *len) "swap %B0" CR_TAB "swap %A0" CR_TAB "ldi %3,0x0f" CR_TAB - "and %A0,%3" CR_TAB + "and %A0,%3" CR_TAB "eor %A0,%B0" CR_TAB - "and %B0,%3" CR_TAB + "and %B0,%3" CR_TAB "eor %A0,%B0"); } break; /* 10 */ @@ -7239,7 +7240,7 @@ lshrhi3_out (rtx_insn *insn, rtx operands[], int *len) if (AVR_HAVE_MUL) { *len = 6; - return ("set" CR_TAB + return ("set" CR_TAB "bld r1,3" CR_TAB "mul %B0,r1" CR_TAB "mov %A0,r1" CR_TAB @@ -7575,7 +7576,7 @@ avr_out_plus_1 (rtx *xop, int *plen, enum rtx_code code, int *pcc, where this must be done is when NEG overflowed in case [2s] because the V computation needs the right sign of the subtrahend. */ - rtx msb = simplify_gen_subreg (QImode, xop[0], mode, n_bytes-1); + rtx msb = simplify_gen_subreg (QImode, xop[0], mode, n_bytes - 1); avr_asm_len ("subi %0,128" CR_TAB "brmi 0f", &msb, plen, 2); @@ -8257,9 +8258,9 @@ avr_out_sign_extend (rtx_insn *insn, rtx *xop, int *plen) avr_asm_len ("mov __tmp_reg__,%0", &r_msb, plen, 1); r_msb = tmp_reg_rtx; } - + avr_asm_len ("lsl %0", &r_msb, plen, 1); - + // ...and propagate it to all the new sign bits for (unsigned n = n_src; n < n_dest; n++) @@ -8374,7 +8375,7 @@ avr_out_insert_notbit (rtx_insn *insn, rtx operands[], rtx xbitno, int *plen) avr_asm_len ("bld %0,%1", op, plen, 1); } - + return ""; } @@ -9236,7 +9237,7 @@ int reg_unused_after (rtx_insn *insn, rtx reg) { return (dead_or_set_p (insn, reg) - || (REG_P(reg) && _reg_unused_after (insn, reg))); + || (REG_P (reg) && _reg_unused_after (insn, reg))); } /* Return nonzero if REG is not used after INSN. @@ -9253,7 +9254,7 @@ _reg_unused_after (rtx_insn *insn, rtx reg) case. Disregard the case where this is a store to memory, since we are checking a register used in the store address. */ set = single_set (insn); - if (set && GET_CODE (SET_DEST (set)) != MEM + if (set && !MEM_P (SET_DEST (set)) && reg_overlap_mentioned_p (reg, SET_DEST (set))) return 1; @@ -9305,7 +9306,7 @@ _reg_unused_after (rtx_insn *insn, rtx reg) return 0; if (set && reg_overlap_mentioned_p (reg, SET_DEST (set))) { - if (GET_CODE (SET_DEST (set)) != MEM) + if (!MEM_P (SET_DEST (set))) retval = 1; else return 0; @@ -9337,7 +9338,7 @@ _reg_unused_after (rtx_insn *insn, rtx reg) if (set && reg_overlap_mentioned_p (reg, SET_SRC (set))) return 0; if (set && reg_overlap_mentioned_p (reg, SET_DEST (set))) - return GET_CODE (SET_DEST (set)) != MEM; + return !MEM_P (SET_DEST (set)); if (set == 0 && reg_overlap_mentioned_p (reg, PATTERN (insn))) return 0; } @@ -9640,7 +9641,7 @@ avr_attribute_table[] = /* Return true if we support address space AS for the architecture in effect and false, otherwise. If LOC is not UNKNOWN_LOCATION then also issue a respective error. */ - + bool avr_addr_space_supported_p (addr_space_t as, location_t loc) { @@ -10582,7 +10583,7 @@ avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, *total += avr_operand_rtx_cost (XEXP (x, 1), mode, code, 1, speed); } - else if (INTVAL (XEXP (x, 1)) >= -63 && INTVAL (XEXP (x, 1)) <= 63) + else if (IN_RANGE (INTVAL (XEXP (x, 1)), -63, 63)) *total = COSTS_N_INSNS (1); else *total = COSTS_N_INSNS (2); @@ -10595,7 +10596,7 @@ avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, *total += avr_operand_rtx_cost (XEXP (x, 1), mode, code, 1, speed); } - else if (INTVAL (XEXP (x, 1)) >= -63 && INTVAL (XEXP (x, 1)) <= 63) + else if (IN_RANGE (INTVAL (XEXP (x, 1)), -63, 63)) *total = COSTS_N_INSNS (2); else *total = COSTS_N_INSNS (3); @@ -10608,7 +10609,7 @@ avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, *total += avr_operand_rtx_cost (XEXP (x, 1), mode, code, 1, speed); } - else if (INTVAL (XEXP (x, 1)) >= -63 && INTVAL (XEXP (x, 1)) <= 63) + else if (IN_RANGE (INTVAL (XEXP (x, 1)), -63, 63)) *total = COSTS_N_INSNS (1); else *total = COSTS_N_INSNS (4); @@ -11323,8 +11324,7 @@ static bool avr_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno, int *total, bool speed) { - bool done = avr_rtx_costs_1 (x, mode, outer_code, - opno, total, speed); + bool done = avr_rtx_costs_1 (x, mode, outer_code, opno, total, speed); if (avr_log.rtx_costs) { @@ -11658,7 +11658,7 @@ avr_reorg (void) { rtx x = XEXP (pattern, 0); rtx src = SET_SRC (pat); - rtx t = XEXP (src,0); + rtx t = XEXP (src, 0); PUT_CODE (t, swap_condition (GET_CODE (t))); XEXP (pattern, 0) = XEXP (pattern, 1); XEXP (pattern, 1) = x; @@ -11669,7 +11669,7 @@ avr_reorg (void) { /* This is a tst insn, we can reverse it. */ rtx src = SET_SRC (pat); - rtx t = XEXP (src,0); + rtx t = XEXP (src, 0); PUT_CODE (t, swap_condition (GET_CODE (t))); XEXP (pattern, 1) = XEXP (pattern, 0); @@ -11682,7 +11682,7 @@ avr_reorg (void) { rtx x = XEXP (pattern, 1); rtx src = SET_SRC (pat); - rtx t = XEXP (src,0); + rtx t = XEXP (src, 0); machine_mode mode = GET_MODE (XEXP (pattern, 0)); if (avr_simplify_comparison_p (mode, GET_CODE (t), x)) @@ -11889,8 +11889,8 @@ avr_hard_regno_call_part_clobbered (unsigned regno, machine_mode mode) /* Return true if any of the following boundaries is crossed: 17/18 or 19/20 (if AVR_TINY), 27/28 and 29/30. */ - return ((regno <= LAST_CALLEE_SAVED_REG && - regno + GET_MODE_SIZE (mode) > (LAST_CALLEE_SAVED_REG + 1)) + return ((regno <= LAST_CALLEE_SAVED_REG + && regno + GET_MODE_SIZE (mode) > 1 + LAST_CALLEE_SAVED_REG) || (regno < REG_Y && regno + GET_MODE_SIZE (mode) > REG_Y) || (regno < REG_Z && regno + GET_MODE_SIZE (mode) > REG_Z)); } @@ -12309,7 +12309,7 @@ avr_output_addr_vec_elt (FILE *stream, int value) } static void -avr_conditional_register_usage(void) +avr_conditional_register_usage (void) { if (AVR_TINY) { @@ -13191,13 +13191,13 @@ avr_expand_delay_cycles (rtx operands0) while (cycles >= 2) { - emit_insn (gen_nopv (GEN_INT(2))); + emit_insn (gen_nopv (GEN_INT (2))); cycles -= 2; } if (cycles == 1) { - emit_insn (gen_nopv (GEN_INT(1))); + emit_insn (gen_nopv (GEN_INT (1))); cycles--; } } @@ -13807,7 +13807,7 @@ avr_default_expand_builtin (enum insn_code icode, tree exp, rtx target) tree arg = CALL_EXPR_ARG (exp, n); rtx op = expand_expr (arg, NULL_RTX, VOIDmode, EXPAND_NORMAL); machine_mode opmode = GET_MODE (op); - machine_mode mode = insn_data[icode].operand[n+1].mode; + machine_mode mode = insn_data[icode].operand[n + 1].mode; if ((opmode == SImode || opmode == VOIDmode) && mode == HImode) { @@ -13820,7 +13820,7 @@ avr_default_expand_builtin (enum insn_code icode, tree exp, rtx target) gcc_assert (opmode == mode || opmode == VOIDmode); - if (!insn_data[icode].operand[n+1].predicate (op, mode)) + if (!insn_data[icode].operand[n + 1].predicate (op, mode)) op = copy_to_mode_reg (mode, op); xop[n] = op; @@ -13870,7 +13870,7 @@ avr_expand_builtin (tree exp, rtx target, switch (id) { case AVR_BUILTIN_NOP: - emit_insn (gen_nopv (GEN_INT(1))); + emit_insn (gen_nopv (GEN_INT (1))); return 0; case AVR_BUILTIN_DELAY_CYCLES: -- cgit v1.1 From a2863bde755d39626ee25e3b7a8875e0d93d7217 Mon Sep 17 00:00:00 2001 From: Ville Voutilainen Date: Thu, 1 Dec 2016 18:23:21 +0200 Subject: Implement LWG 2766, Swapping non-swappable types and LWG 2749, swappable traits for variants. * include/bits/move.h (swap(_Tp&, _Tp&)): Constrain with __is_tuple_like. * include/bits/stl_pair.h (swap(pair<_T1, _T2>&, pair<_T1, _T2>&)): Add a deleted overload. * include/bits/unique_ptr.h (swap(unique_ptr<_Tp, _Dp>&, unique_ptr<_Tp, _Dp>&)): Likewise. * include/std/array (swap(array<_Tp, _Nm>&, array<_Tp, _Nm>&)): Likewise. * include/std/optional (swap(optional<_Tp>&, optional<_Tp>&)): Likewise. * include/std/tuple (__is_tuple_like_impl, __is_tuple_like): Move to type_traits. (swap(tuple<_Elements...>&, tuple<_Elements...>&)): Add a deleted overload. * include/std/type_traits (__is_tuple_like_impl, __is_tuple_like): New. (swap(_Tp&, _Tp&)): Constrain with __is_tuple_like. * include/std/utility (__is_tuple_like_impl): Move to type_traits. * include/std/variant (swap(variant<_Types...>&, variant<_Types...>&)): Add a deleted overload. * testsuite/20_util/optional/swap/2.cc: Add tests for disabled swaps. * testsuite/20_util/pair/swap_cxx17.cc: New. * testsuite/20_util/tuple/swap_cxx17.cc: Likewise. * testsuite/20_util/unique_ptr/specialized_algorithms/swap_cxx17.cc: Likewise. * testsuite/20_util/variant/compile.cc: Add tests for disabled swaps. * testsuite/23_containers/array/specialized_algorithms/swap_cxx17.cc: New. * testsuite/23_containers/array/tuple_interface/get_neg.cc: Adjust. * testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc: Likewise. From-SVN: r243120 --- libstdc++-v3/ChangeLog | 40 ++++++++++++++++++++ libstdc++-v3/include/bits/move.h | 3 +- libstdc++-v3/include/bits/stl_pair.h | 8 ++++ libstdc++-v3/include/bits/unique_ptr.h | 8 ++++ libstdc++-v3/include/std/array | 8 ++++ libstdc++-v3/include/std/optional | 4 ++ libstdc++-v3/include/std/tuple | 18 ++++----- libstdc++-v3/include/std/type_traits | 21 ++++++++++- libstdc++-v3/include/std/utility | 4 -- libstdc++-v3/include/std/variant | 11 +++++- libstdc++-v3/testsuite/20_util/optional/swap/2.cc | 4 +- libstdc++-v3/testsuite/20_util/pair/swap_cxx17.cc | 35 ++++++++++++++++++ libstdc++-v3/testsuite/20_util/tuple/swap_cxx17.cc | 43 ++++++++++++++++++++++ .../specialized_algorithms/swap_cxx17.cc | 33 +++++++++++++++++ libstdc++-v3/testsuite/20_util/variant/compile.cc | 15 ++++++++ .../array/specialized_algorithms/swap_cxx17.cc | 33 +++++++++++++++++ .../23_containers/array/tuple_interface/get_neg.cc | 4 +- .../array/tuple_interface/tuple_element_neg.cc | 2 +- 18 files changed, 270 insertions(+), 24 deletions(-) create mode 100644 libstdc++-v3/testsuite/20_util/pair/swap_cxx17.cc create mode 100644 libstdc++-v3/testsuite/20_util/tuple/swap_cxx17.cc create mode 100644 libstdc++-v3/testsuite/20_util/unique_ptr/specialized_algorithms/swap_cxx17.cc create mode 100644 libstdc++-v3/testsuite/23_containers/array/specialized_algorithms/swap_cxx17.cc diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index cc0b3ae..21404f16 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,5 +1,45 @@ 2016-12-01 Ville Voutilainen + Implement LWG 2766, + Swapping non-swappable types and LWG 2749, + swappable traits for variants. + * include/bits/move.h (swap(_Tp&, _Tp&)): Constrain + with __is_tuple_like. + * include/bits/stl_pair.h (swap(pair<_T1, _T2>&, pair<_T1, _T2>&)): + Add a deleted overload. + * include/bits/unique_ptr.h + (swap(unique_ptr<_Tp, _Dp>&, unique_ptr<_Tp, _Dp>&)): Likewise. + * include/std/array + (swap(array<_Tp, _Nm>&, array<_Tp, _Nm>&)): Likewise. + * include/std/optional + (swap(optional<_Tp>&, optional<_Tp>&)): Likewise. + * include/std/tuple (__is_tuple_like_impl, __is_tuple_like): + Move to type_traits. + (swap(tuple<_Elements...>&, tuple<_Elements...>&)): Add a deleted + overload. + * include/std/type_traits (__is_tuple_like_impl, __is_tuple_like): + New. + (swap(_Tp&, _Tp&)): Constrain with __is_tuple_like. + * include/std/utility (__is_tuple_like_impl): Move to type_traits. + * include/std/variant + (swap(variant<_Types...>&, variant<_Types...>&)): + Add a deleted overload. + * testsuite/20_util/optional/swap/2.cc: Add tests for disabled + swaps. + * testsuite/20_util/pair/swap_cxx17.cc: New. + * testsuite/20_util/tuple/swap_cxx17.cc: Likewise. + * testsuite/20_util/unique_ptr/specialized_algorithms/swap_cxx17.cc: + Likewise. + * testsuite/20_util/variant/compile.cc: Add tests for disabled + swaps. + * testsuite/23_containers/array/specialized_algorithms/swap_cxx17.cc: + New. + * testsuite/23_containers/array/tuple_interface/get_neg.cc: Adjust. + * testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc: + Likewise. + +2016-12-01 Ville Voutilainen + The convertible_to traits need to use a variadic catch-all for the false-cases. * include/std/istream (__is_convertible_to_basic_istream): diff --git a/libstdc++-v3/include/bits/move.h b/libstdc++-v3/include/bits/move.h index d0aefe7..0bd11d6 100644 --- a/libstdc++-v3/include/bits/move.h +++ b/libstdc++-v3/include/bits/move.h @@ -181,7 +181,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template inline #if __cplusplus >= 201103L - typename enable_if<__and_, + typename enable_if<__and_<__not_<__is_tuple_like<_Tp>>, + is_move_constructible<_Tp>, is_move_assignable<_Tp>>::value>::type swap(_Tp& __a, _Tp& __b) noexcept(__and_, diff --git a/libstdc++-v3/include/bits/stl_pair.h b/libstdc++-v3/include/bits/stl_pair.h index ef52538..981dbeb 100644 --- a/libstdc++-v3/include/bits/stl_pair.h +++ b/libstdc++-v3/include/bits/stl_pair.h @@ -478,6 +478,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION swap(pair<_T1, _T2>& __x, pair<_T1, _T2>& __y) noexcept(noexcept(__x.swap(__y))) { __x.swap(__y); } + +#if __cplusplus > 201402L || !defined(__STRICT_ANSI__) // c++1z or gnu++11 + template + inline + typename enable_if, + __is_swappable<_T2>>::value>::type + swap(pair<_T1, _T2>&, pair<_T1, _T2>&) = delete; +#endif #endif // __cplusplus >= 201103L /** diff --git a/libstdc++-v3/include/bits/unique_ptr.h b/libstdc++-v3/include/bits/unique_ptr.h index f9ec60f..03f9bfc 100644 --- a/libstdc++-v3/include/bits/unique_ptr.h +++ b/libstdc++-v3/include/bits/unique_ptr.h @@ -650,6 +650,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION unique_ptr<_Tp, _Dp>& __y) noexcept { __x.swap(__y); } +#if __cplusplus > 201402L || !defined(__STRICT_ANSI__) // c++1z or gnu++11 + template + inline + typename enable_if::value>::type + swap(unique_ptr<_Tp, _Dp>&, + unique_ptr<_Tp, _Dp>&) = delete; +#endif + template inline bool diff --git a/libstdc++-v3/include/std/array b/libstdc++-v3/include/std/array index 3ab0355..fa7bac6 100644 --- a/libstdc++-v3/include/std/array +++ b/libstdc++-v3/include/std/array @@ -288,6 +288,14 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER noexcept(noexcept(__one.swap(__two))) { __one.swap(__two); } +#if __cplusplus > 201402L || !defined(__STRICT_ANSI__) // c++1z or gnu++11 + template + inline + typename enable_if< + !_GLIBCXX_STD_C::__array_traits<_Tp, _Nm>::_Is_swappable::value>::type + swap(array<_Tp, _Nm>&, array<_Tp, _Nm>&) = delete; +#endif + template constexpr _Tp& get(array<_Tp, _Nm>& __arr) noexcept diff --git a/libstdc++-v3/include/std/optional b/libstdc++-v3/include/std/optional index ea673cc..191d64b 100644 --- a/libstdc++-v3/include/std/optional +++ b/libstdc++-v3/include/std/optional @@ -930,6 +930,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { __lhs.swap(__rhs); } template + inline enable_if_t && is_swappable_v<_Tp>)> + swap(optional<_Tp>&, optional<_Tp>&) = delete; + + template constexpr optional> make_optional(_Tp&& __t) { return optional> { std::forward<_Tp>(__t) }; } diff --git a/libstdc++-v3/include/std/tuple b/libstdc++-v3/include/std/tuple index 63cacd4..fb2fd17 100644 --- a/libstdc++-v3/include/std/tuple +++ b/libstdc++-v3/include/std/tuple @@ -1442,17 +1442,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION forward_as_tuple(_Elements&&... __args) noexcept { return tuple<_Elements&&...>(std::forward<_Elements>(__args)...); } - template - struct __is_tuple_like_impl> : true_type - { }; - - // Internal type trait that allows us to sfinae-protect tuple_cat. - template - struct __is_tuple_like - : public __is_tuple_like_impl::type>::type>::type - { }; - template struct __make_tuple_impl; @@ -1597,6 +1586,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION noexcept(noexcept(__x.swap(__y))) { __x.swap(__y); } +#if __cplusplus > 201402L || !defined(__STRICT_ANSI__) // c++1z or gnu++11 + template + inline + typename enable_if...>::value>::type + swap(tuple<_Elements...>&, tuple<_Elements...>&) = delete; +#endif + // A class (and instance) which can be used in 'tie' when an element // of a tuple is not required struct _Swallow_assign diff --git a/libstdc++-v3/include/std/type_traits b/libstdc++-v3/include/std/type_traits index e5f2bba..f164f71 100644 --- a/libstdc++-v3/include/std/type_traits +++ b/libstdc++-v3/include/std/type_traits @@ -2593,9 +2593,28 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION template struct __is_nothrow_swappable; + template + class tuple; + + template + struct __is_tuple_like_impl : false_type + { }; + + template + struct __is_tuple_like_impl> : true_type + { }; + + // Internal type trait that allows us to sfinae-protect tuple_cat. + template + struct __is_tuple_like + : public __is_tuple_like_impl::type>::type>::type + { }; + template inline - typename enable_if<__and_, + typename enable_if<__and_<__not_<__is_tuple_like<_Tp>>, + is_move_constructible<_Tp>, is_move_assignable<_Tp>>::value>::type swap(_Tp&, _Tp&) noexcept(__and_, diff --git a/libstdc++-v3/include/std/utility b/libstdc++-v3/include/std/utility index 3982156..8e02f0e 100644 --- a/libstdc++-v3/include/std/utility +++ b/libstdc++-v3/include/std/utility @@ -140,10 +140,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION using tuple_element_t = typename tuple_element<__i, _Tp>::type; #endif - template - struct __is_tuple_like_impl : false_type - { }; - // Various functions which give std::pair a tuple-like interface. /// Partial specialization for std::pair diff --git a/libstdc++-v3/include/std/variant b/libstdc++-v3/include/std/variant index 34ad3fd..89ca979 100644 --- a/libstdc++-v3/include/std/variant +++ b/libstdc++-v3/include/std/variant @@ -889,10 +889,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { return false; } template - inline auto swap(variant<_Types...>& __lhs, variant<_Types...>& __rhs) - noexcept(noexcept(__lhs.swap(__rhs))) -> decltype(__lhs.swap(__rhs)) + inline enable_if_t<__and_..., + is_swappable<_Types>...>::value> + swap(variant<_Types...>& __lhs, variant<_Types...>& __rhs) + noexcept(noexcept(__lhs.swap(__rhs))) { __lhs.swap(__rhs); } + template + inline enable_if_t..., + is_swappable<_Types>...>::value> + swap(variant<_Types...>&, variant<_Types...>&) = delete; + class bad_variant_access : public exception { public: diff --git a/libstdc++-v3/testsuite/20_util/optional/swap/2.cc b/libstdc++-v3/testsuite/20_util/optional/swap/2.cc index 5793488..cb9291a 100644 --- a/libstdc++-v3/testsuite/20_util/optional/swap/2.cc +++ b/libstdc++-v3/testsuite/20_util/optional/swap/2.cc @@ -33,11 +33,11 @@ void swap(B&, B&) noexcept(false); static_assert( std::is_swappable_v> ); static_assert( !std::is_nothrow_swappable_v> ); -// Not swappable, but optional is swappable via the generic std::swap. +// Not swappable, and optional not swappable via the generic std::swap. struct C { }; void swap(C&, C&) = delete; -static_assert( std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); // Not swappable, and optional not swappable via the generic std::swap. struct D { D(D&&) = delete; }; diff --git a/libstdc++-v3/testsuite/20_util/pair/swap_cxx17.cc b/libstdc++-v3/testsuite/20_util/pair/swap_cxx17.cc new file mode 100644 index 0000000..6b09f42 --- /dev/null +++ b/libstdc++-v3/testsuite/20_util/pair/swap_cxx17.cc @@ -0,0 +1,35 @@ +// { dg-options "-std=gnu++17" } +// { dg-do compile } + +// Copyright (C) 2016 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + + +#include + +// Not swappable, and pair not swappable via the generic std::swap. +struct C { }; +void swap(C&, C&) = delete; + +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); + +// Not swappable, and pair not swappable via the generic std::swap. +struct D { D(D&&) = delete; }; + +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); diff --git a/libstdc++-v3/testsuite/20_util/tuple/swap_cxx17.cc b/libstdc++-v3/testsuite/20_util/tuple/swap_cxx17.cc new file mode 100644 index 0000000..d2a75ce --- /dev/null +++ b/libstdc++-v3/testsuite/20_util/tuple/swap_cxx17.cc @@ -0,0 +1,43 @@ +// { dg-options "-std=gnu++17" } +// { dg-do compile } + +// Copyright (C) 2016 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + + +// NOTE: This makes use of the fact that we know how moveable +// is implemented on tuple. If the implementation changed +// this test may begin to fail. + +#include + +// Not swappable, and tuple not swappable via the generic std::swap. +struct C { }; +void swap(C&, C&) = delete; + +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); + +// Not swappable, and tuple not swappable via the generic std::swap. +struct D { D(D&&) = delete; }; + +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); diff --git a/libstdc++-v3/testsuite/20_util/unique_ptr/specialized_algorithms/swap_cxx17.cc b/libstdc++-v3/testsuite/20_util/unique_ptr/specialized_algorithms/swap_cxx17.cc new file mode 100644 index 0000000..bf106ec --- /dev/null +++ b/libstdc++-v3/testsuite/20_util/unique_ptr/specialized_algorithms/swap_cxx17.cc @@ -0,0 +1,33 @@ +// { dg-options "-std=gnu++17" } +// { dg-do compile } + +// Copyright (C) 2016 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +#include + +// Not swappable, and unique_ptr not swappable via the generic std::swap. +struct C { }; +void swap(C&, C&) = delete; + +static_assert( !std::is_swappable_v> ); + +// Not swappable, and unique_ptr not swappable via the generic std::swap. +struct D { D(D&&) = delete; }; + +static_assert( !std::is_swappable_v> ); + diff --git a/libstdc++-v3/testsuite/20_util/variant/compile.cc b/libstdc++-v3/testsuite/20_util/variant/compile.cc index e3330be..8250a95 100644 --- a/libstdc++-v3/testsuite/20_util/variant/compile.cc +++ b/libstdc++-v3/testsuite/20_util/variant/compile.cc @@ -219,6 +219,21 @@ void test_relational() } } +// Not swappable, and variant not swappable via the generic std::swap. +struct C { }; +void swap(C&, C&) = delete; + +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); + +// Not swappable, and variant not swappable via the generic std::swap. +struct D { D(D&&) = delete; }; + +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); +static_assert( !std::is_swappable_v> ); + void test_swap() { variant a, b; diff --git a/libstdc++-v3/testsuite/23_containers/array/specialized_algorithms/swap_cxx17.cc b/libstdc++-v3/testsuite/23_containers/array/specialized_algorithms/swap_cxx17.cc new file mode 100644 index 0000000..2e93c4d --- /dev/null +++ b/libstdc++-v3/testsuite/23_containers/array/specialized_algorithms/swap_cxx17.cc @@ -0,0 +1,33 @@ +// { dg-options "-std=gnu++17" } +// { dg-do compile } + +// Copyright (C) 2016 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +#include + +// Not swappable, and pair not swappable via the generic std::swap. +struct C { }; +void swap(C&, C&) = delete; + +static_assert( !std::is_swappable_v> ); + +// Not swappable, and pair not swappable via the generic std::swap. +struct D { D(D&&) = delete; }; + +static_assert( !std::is_swappable_v> ); + diff --git a/libstdc++-v3/testsuite/23_containers/array/tuple_interface/get_neg.cc b/libstdc++-v3/testsuite/23_containers/array/tuple_interface/get_neg.cc index f99bbf6..568ec85 100644 --- a/libstdc++-v3/testsuite/23_containers/array/tuple_interface/get_neg.cc +++ b/libstdc++-v3/testsuite/23_containers/array/tuple_interface/get_neg.cc @@ -27,6 +27,6 @@ int n1 = std::get<1>(a); int n2 = std::get<1>(std::move(a)); int n3 = std::get<1>(ca); -// { dg-error "static assertion failed" "" { target *-*-* } 295 } -// { dg-error "static assertion failed" "" { target *-*-* } 304 } +// { dg-error "static assertion failed" "" { target *-*-* } 303 } // { dg-error "static assertion failed" "" { target *-*-* } 312 } +// { dg-error "static assertion failed" "" { target *-*-* } 320 } diff --git a/libstdc++-v3/testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc b/libstdc++-v3/testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc index 1941f3c..32cb10b 100644 --- a/libstdc++-v3/testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc +++ b/libstdc++-v3/testsuite/23_containers/array/tuple_interface/tuple_element_neg.cc @@ -22,4 +22,4 @@ typedef std::tuple_element<1, std::array>::type type; -// { dg-error "static assertion failed" "" { target *-*-* } 343 } +// { dg-error "static assertion failed" "" { target *-*-* } 351 } -- cgit v1.1 From 1f0133ebb9e9209f0fd8e08ddbab2e9a117d1d1e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 1 Dec 2016 18:10:58 +0100 Subject: i386.md (*andndi3_doubleword): Depend on TARGET_SSE2. * config/i386/i386.md (*andndi3_doubleword): Depend on TARGET_SSE2. From-SVN: r243121 --- gcc/ChangeLog | 4 ++++ gcc/config/i386/i386.md | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5b2570b..6768c5f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,7 @@ +2016-12-01 Uros Bizjak + + * config/i386/i386.md (*andndi3_doubleword): Depend on TARGET_SSE2. + 2016-12-01 Georg-Johann Lay * config/avr/avr.c: Fix coding rule glitches. diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index ed525b9..583d2bb 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -8540,7 +8540,7 @@ (not:DI (match_operand:DI 1 "register_operand" "r")) (match_operand:DI 2 "nonimmediate_operand" "rm"))) (clobber (reg:CC FLAGS_REG))] - "TARGET_BMI && !TARGET_64BIT && TARGET_STV && TARGET_SSE" + "TARGET_BMI && !TARGET_64BIT && TARGET_STV && TARGET_SSE2" "#" "&& reload_completed" [(parallel [(set (match_dup 0) @@ -8575,7 +8575,7 @@ (set_attr "btver2_decode" "direct") (set_attr "mode" "SI")]) -(define_insn "*bmi_andn__ccno" +(define_insn "*andn__ccno" [(set (reg FLAGS_REG) (compare (and:SWI48 -- cgit v1.1 From 3c7089946936a3e420f0e5db83212f15e0a7027a Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Thu, 1 Dec 2016 18:11:56 +0000 Subject: * g++.dg/tls/pr77285-1.C: dg-add-options tls From-SVN: r243124 --- gcc/testsuite/ChangeLog | 4 ++++ gcc/testsuite/g++.dg/tls/pr77285-1.C | 1 + 2 files changed, 5 insertions(+) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 2a04091..d6e5ac4 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2016-12-01 David Edelsohn + + * g++.dg/tls/pr77285-1.C: dg-add-options tls + 2016-12-01 James Greenhalgh * gcc.dg/pr78582.c (main): Call setjmp, not sigsetjmp. diff --git a/gcc/testsuite/g++.dg/tls/pr77285-1.C b/gcc/testsuite/g++.dg/tls/pr77285-1.C index d8f69b2..7a93414 100644 --- a/gcc/testsuite/g++.dg/tls/pr77285-1.C +++ b/gcc/testsuite/g++.dg/tls/pr77285-1.C @@ -1,5 +1,6 @@ // { dg-do link { target c++11 } } // { dg-require-effective-target tls } +// { dg-add-options tls } // { dg-additional-sources pr77285-2.C } struct __attribute__((abi_tag("tag"))) X { ~X () {} int i = 0; }; -- cgit v1.1 From 1281fc99115392eb3f19f5e0a5c9b604fc72b27a Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 1 Dec 2016 18:18:30 +0000 Subject: vec.h (vec Prathamesh Kulkarni * vec.h (vec::quick_grow_cleared): Guard call to memset if len-oldlen != 0. (vec::safe_grow_cleared): Likewise. Co-Authored-By: Prathamesh Kulkarni From-SVN: r243125 --- gcc/ChangeLog | 7 +++++++ gcc/vec.h | 8 ++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6768c5f..b567324 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2016-12-01 Richard Biener + Prathamesh Kulkarni + + * vec.h (vec::quick_grow_cleared): Guard call to + memset if len-oldlen != 0. + (vec::safe_grow_cleared): Likewise. + 2016-12-01 Uros Bizjak * config/i386/i386.md (*andndi3_doubleword): Depend on TARGET_SSE2. diff --git a/gcc/vec.h b/gcc/vec.h index 14fb2a6..aa93411 100644 --- a/gcc/vec.h +++ b/gcc/vec.h @@ -1092,8 +1092,10 @@ inline void vec::quick_grow_cleared (unsigned len) { unsigned oldlen = length (); + size_t sz = sizeof (T) * (len - oldlen); quick_grow (len); - memset (&(address ()[oldlen]), 0, sizeof (T) * (len - oldlen)); + if (sz != 0) + memset (&(address ()[oldlen]), 0, sz); } @@ -1605,8 +1607,10 @@ inline void vec::safe_grow_cleared (unsigned len MEM_STAT_DECL) { unsigned oldlen = length (); + size_t sz = sizeof (T) * (len - oldlen); safe_grow (len PASS_MEM_STAT); - memset (&(address ()[oldlen]), 0, sizeof (T) * (len - oldlen)); + if (sz != 0) + memset (&(address ()[oldlen]), 0, sz); } -- cgit v1.1 From d8a2f02ec6c25fb050e96048eb19ae218d79e74b Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Thu, 1 Dec 2016 18:58:47 +0000 Subject: re PR debug/66149 (ICE: tree check: expected field_decl, have template_decl in int_bit_position, at tree.h:5012 with -std=c++14 -gstabs) PR debug/66419 PR c++/78235 * dbxout.c (dbxout_type_fields): Skip TEMPLATE_DECLs. From-SVN: r243126 --- gcc/ChangeLog | 6 ++++++ gcc/dbxout.c | 1 + 2 files changed, 7 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b567324..c70394e 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2016-12-01 David Edelsohn + + PR debug/66419 + PR c++/78235 + * dbxout.c (dbxout_type_fields): Skip TEMPLATE_DECLs. + 2016-12-01 Richard Biener Prathamesh Kulkarni diff --git a/gcc/dbxout.c b/gcc/dbxout.c index 3301417..658cc3d 100644 --- a/gcc/dbxout.c +++ b/gcc/dbxout.c @@ -1479,6 +1479,7 @@ dbxout_type_fields (tree type) /* Omit here local type decls until we know how to support them. */ if (TREE_CODE (tem) == TYPE_DECL + || TREE_CODE (tem) == TEMPLATE_DECL /* Omit here the nameless fields that are used to skip bits. */ || DECL_IGNORED_P (tem) /* Omit fields whose position or size are variable or too large to -- cgit v1.1 From 0269650d4a53dfdde90e90db701ee6233a61f837 Mon Sep 17 00:00:00 2001 From: David Edelsohn Date: Thu, 1 Dec 2016 19:02:34 +0000 Subject: * testsuite/26_numerics/headers/cmath/hypot.cc: XFAIL on AIX. From-SVN: r243127 --- libstdc++-v3/ChangeLog | 4 ++++ libstdc++-v3/testsuite/26_numerics/headers/cmath/hypot.cc | 1 + 2 files changed, 5 insertions(+) diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 21404f16..08d9229 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,7 @@ +2016-12-01 David Edelsohn + + * testsuite/26_numerics/headers/cmath/hypot.cc: XFAIL on AIX. + 2016-12-01 Ville Voutilainen Implement LWG 2766, diff --git a/libstdc++-v3/testsuite/26_numerics/headers/cmath/hypot.cc b/libstdc++-v3/testsuite/26_numerics/headers/cmath/hypot.cc index ad9e77e..b4df3ba 100644 --- a/libstdc++-v3/testsuite/26_numerics/headers/cmath/hypot.cc +++ b/libstdc++-v3/testsuite/26_numerics/headers/cmath/hypot.cc @@ -17,6 +17,7 @@ // { dg-options "-std=gnu++17" } // { dg-do run { target c++1z } } +// { dg-xfail-run-if "AIX long double" { powerpc-ibm-aix* } } #include #include -- cgit v1.1 From 98934fac3bd8fd149387164ac3ee97795a5e6825 Mon Sep 17 00:00:00 2001 From: Ian Lance Taylor Date: Thu, 1 Dec 2016 19:54:36 +0000 Subject: compiler: add slice initializers to the GC root list As of https://golang.org/cl/32917 we can put slice initializers in the .data section. The program can still change the values in those slices. That means that if the slice elements can contain pointers, we need to register the entire initializer as a GC root. This would be straightforward except that we only have a Bexpression for the slice initializer, not an Expression. So introduce a Backend_expression type that wraps a Bexpression as an Expression. The test case for this is https://golang.org/cl/33790. Reviewed-on: https://go-review.googlesource.com/33792 From-SVN: r243129 --- gcc/go/gofrontend/MERGE | 2 +- gcc/go/gofrontend/expressions.cc | 36 ++++++++++++++++++++++++++ gcc/go/gofrontend/expressions.h | 55 +++++++++++++++++++++++++++++++++++++++- gcc/go/gofrontend/gogo.cc | 28 +++++++++++++++++--- gcc/go/gofrontend/gogo.h | 12 +++++++++ 5 files changed, 128 insertions(+), 5 deletions(-) diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE index 8f5f542..5529002 100644 --- a/gcc/go/gofrontend/MERGE +++ b/gcc/go/gofrontend/MERGE @@ -1,4 +1,4 @@ -97b949f249515a61d3c09e9e06f08c8af189e967 +b7bad96ce0af50a1129eaab9aa110d68a601917b The first line of this file holds the git revision number of the last merge done from the gofrontend repository. diff --git a/gcc/go/gofrontend/expressions.cc b/gcc/go/gofrontend/expressions.cc index 0ab6726..9740d32 100644 --- a/gcc/go/gofrontend/expressions.cc +++ b/gcc/go/gofrontend/expressions.cc @@ -4295,6 +4295,20 @@ Unary_expression::do_get_backend(Translate_context* context) true, copy_to_heap, false, bexpr); bexpr = gogo->backend()->var_expression(implicit, loc); + + // If we are not copying a slice initializer to the heap, + // then it can be changed by the program, so if it can + // contain pointers we must register it as a GC root. + if (this->is_slice_init_ + && !copy_to_heap + && this->expr_->type()->has_pointer()) + { + Bexpression* root = + gogo->backend()->var_expression(implicit, loc); + root = gogo->backend()->address_expression(root, loc); + Type* type = Type::make_pointer_type(this->expr_->type()); + gogo->add_gc_root(Expression::make_backend(root, type, loc)); + } } else if ((this->expr_->is_composite_literal() || this->expr_->string_expression() != NULL) @@ -15433,6 +15447,28 @@ Expression::make_compound(Expression* init, Expression* expr, Location location) return new Compound_expression(init, expr, location); } +// Class Backend_expression. + +int +Backend_expression::do_traverse(Traverse*) +{ + return TRAVERSE_CONTINUE; +} + +void +Backend_expression::do_dump_expression(Ast_dump_context* ast_dump_context) const +{ + ast_dump_context->ostream() << "backend_expression<"; + ast_dump_context->dump_type(this->type_); + ast_dump_context->ostream() << ">"; +} + +Expression* +Expression::make_backend(Bexpression* bexpr, Type* type, Location location) +{ + return new Backend_expression(bexpr, type, location); +} + // Import an expression. This comes at the end in order to see the // various class definitions. diff --git a/gcc/go/gofrontend/expressions.h b/gcc/go/gofrontend/expressions.h index 96d314f..f31d4a6 100644 --- a/gcc/go/gofrontend/expressions.h +++ b/gcc/go/gofrontend/expressions.h @@ -137,7 +137,8 @@ class Expression EXPRESSION_STRUCT_FIELD_OFFSET, EXPRESSION_LABEL_ADDR, EXPRESSION_CONDITIONAL, - EXPRESSION_COMPOUND + EXPRESSION_COMPOUND, + EXPRESSION_BACKEND }; Expression(Expression_classification, Location); @@ -485,6 +486,10 @@ class Expression static Expression* make_compound(Expression*, Expression*, Location); + // Make a backend expression. + static Expression* + make_backend(Bexpression*, Type*, Location); + // Return the expression classification. Expression_classification classification() const @@ -3825,6 +3830,54 @@ class Compound_expression : public Expression Expression* expr_; }; +// A backend expression. This is a backend expression wrapped in an +// Expression, for convenience during backend generation. + +class Backend_expression : public Expression +{ + public: + Backend_expression(Bexpression* bexpr, Type* type, Location location) + : Expression(EXPRESSION_BACKEND, location), bexpr_(bexpr), type_(type) + {} + + protected: + int + do_traverse(Traverse*); + + // For now these are always valid static initializers. If that + // changes we can change this. + bool + do_is_static_initializer() const + { return true; } + + Type* + do_type() + { return this->type_; } + + void + do_determine_type(const Type_context*) + { } + + Expression* + do_copy() + { + return new Backend_expression(this->bexpr_, this->type_, this->location()); + } + + Bexpression* + do_get_backend(Translate_context*) + { return this->bexpr_; } + + void + do_dump_expression(Ast_dump_context*) const; + + private: + // The backend expression we are wrapping. + Bexpression* bexpr_; + // The type of the expression; + Type* type_; +}; + // A numeric constant. This is used both for untyped constants and // for constants that have a type. diff --git a/gcc/go/gofrontend/gogo.cc b/gcc/go/gofrontend/gogo.cc index b671ce5..d685bca 100644 --- a/gcc/go/gofrontend/gogo.cc +++ b/gcc/go/gofrontend/gogo.cc @@ -54,7 +54,9 @@ Gogo::Gogo(Backend* backend, Linemap* linemap, int, int pointer_size) interface_types_(), specific_type_functions_(), specific_type_functions_are_written_(false), - named_types_are_converted_(false) + named_types_are_converted_(false), + analysis_sets_(), + gc_roots_() { const Location loc = Linemap::predeclared_location(); @@ -750,10 +752,9 @@ Gogo::register_gc_vars(const std::vector& var_gc, Expression_list* roots_init = new Expression_list(); - size_t i = 0; for (std::vector::const_iterator p = var_gc.begin(); p != var_gc.end(); - ++p, ++i) + ++p) { Expression_list* init = new Expression_list(); @@ -772,6 +773,27 @@ Gogo::register_gc_vars(const std::vector& var_gc, roots_init->push_back(root_ctor); } + for (std::vector::const_iterator p = this->gc_roots_.begin(); + p != this->gc_roots_.end(); + ++p) + { + Expression_list *init = new Expression_list(); + + Expression* expr = *p; + Location eloc = expr->location(); + init->push_back(expr); + + Type* type = expr->type()->points_to(); + go_assert(type != NULL); + Expression* size = + Expression::make_type_info(type, Expression::TYPE_INFO_SIZE); + init->push_back(size); + + Expression* root_ctor = + Expression::make_struct_composite_literal(root_type, init, eloc); + roots_init->push_back(root_ctor); + } + // The list ends with a NULL entry. Expression_list* null_init = new Expression_list(); diff --git a/gcc/go/gofrontend/gogo.h b/gcc/go/gofrontend/gogo.h index 62bbf9e..7ddb3ce 100644 --- a/gcc/go/gofrontend/gogo.h +++ b/gcc/go/gofrontend/gogo.h @@ -19,6 +19,7 @@ class Typed_identifier; class Typed_identifier_list; class Function_type; class Expression; +class Expression_list; class Statement; class Temporary_statement; class Block; @@ -556,6 +557,15 @@ class Gogo specific_type_functions_are_written() const { return this->specific_type_functions_are_written_; } + // Add a pointer that needs to be added to the list of objects + // traversed by the garbage collector. This should be an expression + // of pointer type that points to static storage. It's not + // necessary to add global variables to this list, just global + // variable initializers that would otherwise not be seen. + void + add_gc_root(Expression* expr) + { this->gc_roots_.push_back(expr); } + // Traverse the tree. See the Traverse class. void traverse(Traverse*); @@ -892,6 +902,8 @@ class Gogo // A list containing groups of possibly mutually recursive functions to be // considered during escape analysis. std::vector analysis_sets_; + // A list of objects to add to the GC roots. + std::vector gc_roots_; }; // A block of statements. -- cgit v1.1 From 96ad5df6db65383330cba79ed823a0256e750033 Mon Sep 17 00:00:00 2001 From: "Steven G. Kargl" Date: Thu, 1 Dec 2016 20:37:55 +0000 Subject: re PR fortran/78279 (ICE in identical_array_ref, at fortran/dependency.c:104) 2016-12-01 Steven G. Kargl PR fortran/78279 * dependency.c (identical_array_ref): Convert gcc_assert to conditional and gfc_internal_error. 2016-12-01 Steven G. Kargl PR fortran/78279 * gfortran.dg/pr78279.f90: New test. From-SVN: r243131 --- gcc/fortran/ChangeLog | 6 ++++++ gcc/fortran/dependency.c | 4 +++- gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/gfortran.dg/pr78279.f90 | 10 ++++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gfortran.dg/pr78279.f90 diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index 278c08f..d410392 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,9 @@ +2016-12-01 Steven G. Kargl + + PR fortran/78279 + * dependency.c (identical_array_ref): Convert gcc_assert to conditional + and gfc_internal_error. + 2016-11-30 Andre Vehreschild * check.c (gfc_check_allocated): By pass the caf_get call and check on diff --git a/gcc/fortran/dependency.c b/gcc/fortran/dependency.c index 82c5e6b..4a3c1a7 100644 --- a/gcc/fortran/dependency.c +++ b/gcc/fortran/dependency.c @@ -101,7 +101,9 @@ identical_array_ref (gfc_array_ref *a1, gfc_array_ref *a2) if (a1->type == AR_ELEMENT && a2->type == AR_ELEMENT) { - gcc_assert (a1->dimen == a2->dimen); + if (a1->dimen != a2->dimen) + gfc_internal_error ("identical_array_ref(): inconsistent dimensions"); + for (i = 0; i < a1->dimen; i++) { if (gfc_dep_compare_expr (a1->start[i], a2->start[i]) != 0) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index d6e5ac4..321a48a 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 Steven G. Kargl + + PR fortran/78279 + * gfortran.dg/pr78279.f90: New test. + 2016-12-01 David Edelsohn * g++.dg/tls/pr77285-1.C: dg-add-options tls diff --git a/gcc/testsuite/gfortran.dg/pr78279.f90 b/gcc/testsuite/gfortran.dg/pr78279.f90 new file mode 100644 index 0000000..cb01752 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr78279.f90 @@ -0,0 +1,10 @@ +! { dg-do compile } +! { dg-options "-Ofast" } +program p + integer :: i + real :: z(2,4) + z = 0.0 + do i = 1, 3 + if ( z(i) > z(1,i+1) ) print *, i ! { dg-error "mismatch in array reference" } + end do +end -- cgit v1.1 From f99bd883fb0d051ff2d7cebe217f2d2a8ad16bfd Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Thu, 1 Dec 2016 21:41:10 +0000 Subject: sparc.opt (mlra): New target option. * config/sparc/sparc.opt (mlra): New target option. * config/sparc/sparc.c (TARGET_LRA_P): Define to... (sparc_lra_p): ...this. New function. (D_MODES, DF_MODES): Add missing cast. * config/sparc/sparc.md (*movsi_lo_sum, *movsi_high): Do not provide these insns when flag_pic. (sethi_di_medlow, losum_di_medlow, seth44, setm44, setl44, sethh, setlm, sethm, setlo, embmedany_sethi, embmedany_losum, embmedany_brsum, embmedany_textuhi, embmedany_texthi, embmedany_textulo, embmedany_textlo): Likewise. (sethi_di_medlow_embmedany_pic): Provide it only when flag_pic. Co-Authored-By: David S. Miller From-SVN: r243135 --- gcc/ChangeLog | 15 +++++++++++++++ gcc/config/sparc/sparc.c | 15 ++++++++++++--- gcc/config/sparc/sparc.md | 46 +++++++++++++++++++++------------------------- gcc/config/sparc/sparc.opt | 4 ++++ 4 files changed, 52 insertions(+), 28 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c70394e..5a55f42 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,18 @@ +2016-12-01 Eric Botcazou + David S. Miller + + * config/sparc/sparc.opt (mlra): New target option. + * config/sparc/sparc.c (TARGET_LRA_P): Define to... + (sparc_lra_p): ...this. New function. + (D_MODES, DF_MODES): Add missing cast. + * config/sparc/sparc.md (*movsi_lo_sum, *movsi_high): Do not + provide these insns when flag_pic. + (sethi_di_medlow, losum_di_medlow, seth44, setm44, setl44, sethh, + setlm, sethm, setlo, embmedany_sethi, embmedany_losum, + embmedany_brsum, embmedany_textuhi, embmedany_texthi, + embmedany_textulo, embmedany_textlo): Likewise. + (sethi_di_medlow_embmedany_pic): Provide it only with flag_pic. + 2016-12-01 David Edelsohn PR debug/66419 diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c index a70a0ad..e17552a 100644 --- a/gcc/config/sparc/sparc.c +++ b/gcc/config/sparc/sparc.c @@ -639,6 +639,7 @@ static const char *sparc_mangle_type (const_tree); static void sparc_trampoline_init (rtx, tree, rtx); static machine_mode sparc_preferred_simd_mode (machine_mode); static reg_class_t sparc_preferred_reload_class (rtx x, reg_class_t rclass); +static bool sparc_lra_p (void); static bool sparc_print_operand_punct_valid_p (unsigned char); static void sparc_print_operand (FILE *, rtx, int); static void sparc_print_operand_address (FILE *, machine_mode, rtx); @@ -836,7 +837,7 @@ char sparc_hard_reg_printed[8]; #endif #undef TARGET_LRA_P -#define TARGET_LRA_P hook_bool_void_false +#define TARGET_LRA_P sparc_lra_p #undef TARGET_LEGITIMATE_ADDRESS_P #define TARGET_LEGITIMATE_ADDRESS_P sparc_legitimate_address_p @@ -4787,7 +4788,7 @@ enum sparc_mode_class { ((1 << (int) H_MODE) | (1 << (int) S_MODE) | (1 << (int) SF_MODE)) /* Modes for double-word and smaller quantities. */ -#define D_MODES (S_MODES | (1 << (int) D_MODE) | (1 << DF_MODE)) +#define D_MODES (S_MODES | (1 << (int) D_MODE) | (1 << (int) DF_MODE)) /* Modes for quad-word and smaller quantities. */ #define T_MODES (D_MODES | (1 << (int) T_MODE) | (1 << (int) TF_MODE)) @@ -4799,7 +4800,7 @@ enum sparc_mode_class { #define SF_MODES ((1 << (int) S_MODE) | (1 << (int) SF_MODE)) /* Modes for double-float and smaller quantities. */ -#define DF_MODES (SF_MODES | (1 << (int) D_MODE) | (1 << DF_MODE)) +#define DF_MODES (SF_MODES | (1 << (int) D_MODE) | (1 << (int) DF_MODE)) /* Modes for quad-float and smaller quantities. */ #define TF_MODES (DF_MODES | (1 << (int) TF_MODE)) @@ -12248,6 +12249,14 @@ sparc_preferred_reload_class (rtx x, reg_class_t rclass) return rclass; } +/* Return true if we use LRA instead of reload pass. */ + +static bool +sparc_lra_p (void) +{ + return TARGET_LRA; +} + /* Output a wide multiply instruction in V8+ mode. INSN is the instruction, OPERANDS are its operands and OPCODE is the mnemonic to be used. */ diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md index f33c391..896ce4b 100644 --- a/gcc/config/sparc/sparc.md +++ b/gcc/config/sparc/sparc.md @@ -1568,13 +1568,13 @@ [(set (match_operand:SI 0 "register_operand" "=r") (lo_sum:SI (match_operand:SI 1 "register_operand" "r") (match_operand:SI 2 "immediate_operand" "in")))] - "" + "!flag_pic" "or\t%1, %%lo(%a2), %0") (define_insn "*movsi_high" [(set (match_operand:SI 0 "register_operand" "=r") (high:SI (match_operand:SI 1 "immediate_operand" "in")))] - "" + "!flag_pic" "sethi\t%%hi(%a1), %0") ;; The next two patterns must wrap the SYMBOL_REF in an UNSPEC @@ -1846,27 +1846,27 @@ (define_insn "*sethi_di_medlow_embmedany_pic" [(set (match_operand:DI 0 "register_operand" "=r") (high:DI (match_operand:DI 1 "medium_pic_operand" "")))] - "(TARGET_CM_MEDLOW || TARGET_CM_EMBMEDANY) && check_pic (1)" + "(TARGET_CM_MEDLOW || TARGET_CM_EMBMEDANY) && flag_pic && check_pic (1)" "sethi\t%%hi(%a1), %0") (define_insn "*sethi_di_medlow" [(set (match_operand:DI 0 "register_operand" "=r") (high:DI (match_operand:DI 1 "symbolic_operand" "")))] - "TARGET_CM_MEDLOW && check_pic (1)" + "TARGET_CM_MEDLOW && !flag_pic" "sethi\t%%hi(%a1), %0") (define_insn "*losum_di_medlow" [(set (match_operand:DI 0 "register_operand" "=r") (lo_sum:DI (match_operand:DI 1 "register_operand" "r") (match_operand:DI 2 "symbolic_operand" "")))] - "TARGET_CM_MEDLOW" + "TARGET_CM_MEDLOW && !flag_pic" "or\t%1, %%lo(%a2), %0") (define_insn "seth44" [(set (match_operand:DI 0 "register_operand" "=r") (high:DI (unspec:DI [(match_operand:DI 1 "symbolic_operand" "")] UNSPEC_SETH44)))] - "TARGET_CM_MEDMID" + "TARGET_CM_MEDMID && !flag_pic" "sethi\t%%h44(%a1), %0") (define_insn "setm44" @@ -1874,28 +1874,28 @@ (lo_sum:DI (match_operand:DI 1 "register_operand" "r") (unspec:DI [(match_operand:DI 2 "symbolic_operand" "")] UNSPEC_SETM44)))] - "TARGET_CM_MEDMID" + "TARGET_CM_MEDMID && !flag_pic" "or\t%1, %%m44(%a2), %0") (define_insn "setl44" [(set (match_operand:DI 0 "register_operand" "=r") (lo_sum:DI (match_operand:DI 1 "register_operand" "r") (match_operand:DI 2 "symbolic_operand" "")))] - "TARGET_CM_MEDMID" + "TARGET_CM_MEDMID && !flag_pic" "or\t%1, %%l44(%a2), %0") (define_insn "sethh" [(set (match_operand:DI 0 "register_operand" "=r") (high:DI (unspec:DI [(match_operand:DI 1 "symbolic_operand" "")] UNSPEC_SETHH)))] - "TARGET_CM_MEDANY" + "TARGET_CM_MEDANY && !flag_pic" "sethi\t%%hh(%a1), %0") (define_insn "setlm" [(set (match_operand:DI 0 "register_operand" "=r") (high:DI (unspec:DI [(match_operand:DI 1 "symbolic_operand" "")] UNSPEC_SETLM)))] - "TARGET_CM_MEDANY" + "TARGET_CM_MEDANY && !flag_pic" "sethi\t%%lm(%a1), %0") (define_insn "sethm" @@ -1903,49 +1903,49 @@ (lo_sum:DI (match_operand:DI 1 "register_operand" "r") (unspec:DI [(match_operand:DI 2 "symbolic_operand" "")] UNSPEC_EMB_SETHM)))] - "TARGET_CM_MEDANY" + "TARGET_CM_MEDANY && !flag_pic" "or\t%1, %%hm(%a2), %0") (define_insn "setlo" [(set (match_operand:DI 0 "register_operand" "=r") (lo_sum:DI (match_operand:DI 1 "register_operand" "r") (match_operand:DI 2 "symbolic_operand" "")))] - "TARGET_CM_MEDANY" + "TARGET_CM_MEDANY && !flag_pic" "or\t%1, %%lo(%a2), %0") (define_insn "embmedany_sethi" [(set (match_operand:DI 0 "register_operand" "=r") (high:DI (unspec:DI [(match_operand:DI 1 "data_segment_operand" "")] UNSPEC_EMB_HISUM)))] - "TARGET_CM_EMBMEDANY && check_pic (1)" + "TARGET_CM_EMBMEDANY && !flag_pic" "sethi\t%%hi(%a1), %0") (define_insn "embmedany_losum" [(set (match_operand:DI 0 "register_operand" "=r") (lo_sum:DI (match_operand:DI 1 "register_operand" "r") (match_operand:DI 2 "data_segment_operand" "")))] - "TARGET_CM_EMBMEDANY" + "TARGET_CM_EMBMEDANY && !flag_pic" "add\t%1, %%lo(%a2), %0") (define_insn "embmedany_brsum" [(set (match_operand:DI 0 "register_operand" "=r") (unspec:DI [(match_operand:DI 1 "register_operand" "r")] UNSPEC_EMB_HISUM))] - "TARGET_CM_EMBMEDANY" + "TARGET_CM_EMBMEDANY && !flag_pic" "add\t%1, %_, %0") (define_insn "embmedany_textuhi" [(set (match_operand:DI 0 "register_operand" "=r") (high:DI (unspec:DI [(match_operand:DI 1 "text_segment_operand" "")] UNSPEC_EMB_TEXTUHI)))] - "TARGET_CM_EMBMEDANY && check_pic (1)" + "TARGET_CM_EMBMEDANY && !flag_pic" "sethi\t%%uhi(%a1), %0") (define_insn "embmedany_texthi" [(set (match_operand:DI 0 "register_operand" "=r") (high:DI (unspec:DI [(match_operand:DI 1 "text_segment_operand" "")] UNSPEC_EMB_TEXTHI)))] - "TARGET_CM_EMBMEDANY && check_pic (1)" + "TARGET_CM_EMBMEDANY && !flag_pic" "sethi\t%%hi(%a1), %0") (define_insn "embmedany_textulo" @@ -1953,14 +1953,14 @@ (lo_sum:DI (match_operand:DI 1 "register_operand" "r") (unspec:DI [(match_operand:DI 2 "text_segment_operand" "")] UNSPEC_EMB_TEXTULO)))] - "TARGET_CM_EMBMEDANY" + "TARGET_CM_EMBMEDANY && !flag_pic" "or\t%1, %%ulo(%a2), %0") (define_insn "embmedany_textlo" [(set (match_operand:DI 0 "register_operand" "=r") (lo_sum:DI (match_operand:DI 1 "register_operand" "r") (match_operand:DI 2 "text_segment_operand" "")))] - "TARGET_CM_EMBMEDANY" + "TARGET_CM_EMBMEDANY && !flag_pic" "or\t%1, %%lo(%a2), %0") ;; Now some patterns to help reload out a bit. @@ -1968,9 +1968,7 @@ [(parallel [(match_operand:DI 0 "register_operand" "=r") (match_operand:DI 1 "immediate_operand" "") (match_operand:TI 2 "register_operand" "=&r")])] - "(TARGET_CM_MEDANY - || TARGET_CM_EMBMEDANY) - && !flag_pic" + "(TARGET_CM_MEDANY || TARGET_CM_EMBMEDANY) && !flag_pic" { sparc_emit_set_symbolic_const64 (operands[0], operands[1], operands[2]); DONE; @@ -1980,9 +1978,7 @@ [(parallel [(match_operand:DI 0 "register_operand" "=r") (match_operand:DI 1 "immediate_operand" "") (match_operand:TI 2 "register_operand" "=&r")])] - "(TARGET_CM_MEDANY - || TARGET_CM_EMBMEDANY) - && !flag_pic" + "(TARGET_CM_MEDANY || TARGET_CM_EMBMEDANY) && !flag_pic" { sparc_emit_set_symbolic_const64 (operands[0], operands[1], operands[2]); DONE; diff --git a/gcc/config/sparc/sparc.opt b/gcc/config/sparc/sparc.opt index 1be7800..973fe6f 100644 --- a/gcc/config/sparc/sparc.opt +++ b/gcc/config/sparc/sparc.opt @@ -57,6 +57,10 @@ msoft-quad-float Target Report RejectNegative InverseMask(HARD_QUAD) Do not use hardware quad fp instructions. +mlra +Target Report Mask(LRA) +Enable Local Register Allocation. + mv8plus Target Report Mask(V8PLUS) Compile for V8+ ABI. -- cgit v1.1 From 859faa171ebabdddf364564acad99750cf2b6f56 Mon Sep 17 00:00:00 2001 From: David Malcolm Date: Thu, 1 Dec 2016 21:56:09 +0000 Subject: dwarf2out.c: fix jit issue with early_dwarf_finished All of the jit testcases that generate debuginfo appear to have been failing since r240228 on their 2nd in-process iteration on this assertion in set_early_dwarf's ctor: gcc_assert (! early_dwarf_finished); Root cause is that the global is never reset at the end of compilation, which this patch fixes in the obvious way. gcc/ChangeLog: * dwarf2out.c (dwarf2out_c_finalize): Reset early_dwarf and early_dwarf_finished. From-SVN: r243136 --- gcc/ChangeLog | 5 +++++ gcc/dwarf2out.c | 3 +++ 2 files changed, 8 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5a55f42..b23481f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 David Malcolm + + * dwarf2out.c (dwarf2out_c_finalize): Reset early_dwarf and + early_dwarf_finished. + 2016-12-01 Eric Botcazou David S. Miller diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c index bc328ab..8dc8523 100644 --- a/gcc/dwarf2out.c +++ b/gcc/dwarf2out.c @@ -29830,6 +29830,9 @@ dwarf2out_c_finalize (void) cold_text_section = NULL; current_unit_personality = NULL; + early_dwarf = false; + early_dwarf_finished = false; + next_die_offset = 0; single_comp_unit_die = NULL; comdat_type_list = NULL; -- cgit v1.1 From c1ff51dc9f743fb6ecd77a9374e543d285f98cb0 Mon Sep 17 00:00:00 2001 From: Jason Merrill Date: Thu, 1 Dec 2016 17:10:57 -0500 Subject: fix PR number From-SVN: r243137 --- gcc/cp/ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index 35db0db..1a9a1ed 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -216,7 +216,7 @@ IMAGPART_EXPR can be lvalues. DR 374 - PR c++/56840 + PR c++/56480 * pt.c (check_specialization_namespace): Allow any enclosing namespace. (check_unqualified_spec_or_inst): New. -- cgit v1.1 From 03e88100e14719f4e05dd379e88ae4daf68fa625 Mon Sep 17 00:00:00 2001 From: Jason Merrill Date: Thu, 1 Dec 2016 17:13:06 -0500 Subject: call.c (add_function_candidate): Exclude inherited copy/move ctors. * call.c (add_function_candidate): Exclude inherited copy/move ctors. From-SVN: r243138 --- gcc/cp/ChangeLog | 5 +++++ gcc/cp/call.c | 19 +++++++++++++++++++ gcc/testsuite/g++.dg/cpp0x/inh-ctor15a.C | 14 -------------- gcc/testsuite/g++.dg/cpp1z/inh-ctor36.C | 18 ++++++++++++++++++ 4 files changed, 42 insertions(+), 14 deletions(-) delete mode 100644 gcc/testsuite/g++.dg/cpp0x/inh-ctor15a.C create mode 100644 gcc/testsuite/g++.dg/cpp1z/inh-ctor36.C diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index 1a9a1ed..b407d17 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 Jason Merrill + + * call.c (add_function_candidate): Exclude inherited copy/move + ctors. + 2016-11-29 David Malcolm PR c++/77922 diff --git a/gcc/cp/call.c b/gcc/cp/call.c index 97003e5..561cc83 100644 --- a/gcc/cp/call.c +++ b/gcc/cp/call.c @@ -2042,6 +2042,25 @@ add_function_candidate (struct z_candidate **candidates, reason = arity_rejection (first_arg, i + remaining, len); } + /* A constructor that is a direct member of a class C and has a first + parameter of type "reference to cv C" (including such a constructor + instantiated from a template) is excluded from the set of candidate + functions when used to construct an object of type derived from C (12.6.3 + [class.inhctor.init]) with an argument list containing a single + argument. */ + if (viable && len == 1 && parmlist && DECL_CONSTRUCTOR_P (fn) + && flag_new_inheriting_ctors + && DECL_INHERITED_CTOR (fn)) + { + tree ptype = non_reference (TREE_VALUE (parmlist)); + tree ctype = DECL_INHERITED_CTOR_BASE (fn); + if (same_type_ignoring_top_level_qualifiers_p (ptype, ctype)) + { + viable = false; + reason = inherited_ctor_rejection (); + } + } + /* Second, for a function to be viable, its constraints must be satisfied. */ if (flag_concepts && viable diff --git a/gcc/testsuite/g++.dg/cpp0x/inh-ctor15a.C b/gcc/testsuite/g++.dg/cpp0x/inh-ctor15a.C deleted file mode 100644 index a9abb84..0000000 --- a/gcc/testsuite/g++.dg/cpp0x/inh-ctor15a.C +++ /dev/null @@ -1,14 +0,0 @@ -// P0136 caused us to start inheriting base copy constructors. -// { dg-do compile { target c++11 } } -// { dg-options -fnew-inheriting-ctors } - -struct A { A(int); }; -struct B: public A -{ - using A::A; -}; - -A a (42); - -B b1 (24); // inherited -B b2 (a); // also inherited now diff --git a/gcc/testsuite/g++.dg/cpp1z/inh-ctor36.C b/gcc/testsuite/g++.dg/cpp1z/inh-ctor36.C new file mode 100644 index 0000000..768a966 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp1z/inh-ctor36.C @@ -0,0 +1,18 @@ +// { dg-do link { target c++11 } } + +struct X { X(X &&); }; +struct A { + A() {} + A(const A&); // #1 + A(A &&) = default; // #2, defined as deleted (12.8 [class.copy]) + template A(T &&); // #3 + union { X x; }; +}; +struct B : A { + using A::A; + B(...) {} +}; + +int main() { + B b = A(); // calls B::B(...): #1, #2, and #3 are excluded from candidate set +} -- cgit v1.1 From c3a2f7405c2f88201b44d2edc3b8651109f0b142 Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Thu, 1 Dec 2016 22:36:49 +0000 Subject: * es.po: Update. From-SVN: r243139 --- gcc/po/ChangeLog | 4 + gcc/po/es.po | 328 ++++++++++++------------------------------------------- 2 files changed, 73 insertions(+), 259 deletions(-) diff --git a/gcc/po/ChangeLog b/gcc/po/ChangeLog index 340d7d4..4b6ca4b 100644 --- a/gcc/po/ChangeLog +++ b/gcc/po/ChangeLog @@ -1,3 +1,7 @@ +2016-12-01 Joseph Myers + + * es.po: Update. + 2016-11-30 Joseph Myers * es.po: Update. diff --git a/gcc/po/es.po b/gcc/po/es.po index e7bfcd0..cf04cb3 100644 --- a/gcc/po/es.po +++ b/gcc/po/es.po @@ -35,7 +35,7 @@ msgstr "" "Project-Id-Version: gcc 6.2.0\n" "Report-Msgid-Bugs-To: http://gcc.gnu.org/bugs.html\n" "POT-Creation-Date: 2016-08-19 21:03+0000\n" -"PO-Revision-Date: 2016-11-30 22:55+0100\n" +"PO-Revision-Date: 2016-12-01 23:28+0100\n" "Last-Translator: Antonio Ceballos \n" "Language-Team: Spanish \n" "Language: es\n" @@ -8105,7 +8105,7 @@ msgstr "Genera código que use las instrucciones de coma flotante del 68881." #: config/m68k/m68k.opt:99 msgid "Align variables on a 32-bit boundary." -msgstr "Alínea las variables en límites de 32-bit." +msgstr "Alinea las variables en límites de 32-bit." #: config/m68k/m68k.opt:103 config/arm/arm.opt:81 config/nios2/nios2.opt:570 #: config/nds32/nds32.opt:66 config/c6x/c6x.opt:67 @@ -8504,152 +8504,102 @@ msgid "Use simple data speculation check for control speculation." msgstr "Usa la revisión de especulación de datos simple para el control de especulación." #: config/ia64/ia64.opt:174 -#, fuzzy -#| msgid "Count speculative dependencies while calculating priority of instructions" msgid "Count speculative dependencies while calculating priority of instructions." msgstr "Cuenta las dependencias especulativas mientras se calcula la prioridad de las instrucciones." #: config/ia64/ia64.opt:178 -#, fuzzy -#| msgid "Place a stop bit after every cycle when scheduling" msgid "Place a stop bit after every cycle when scheduling." -msgstr "Coloca un bit de parada después de cada ciclo durante la calendarización." +msgstr "Coloca un bit de parada después de cada ciclo durante la planificación." #: config/ia64/ia64.opt:182 -#, fuzzy -#| msgid "Assume that floating-point stores and loads are not likely to cause conflict when placed into one instruction group" msgid "Assume that floating-point stores and loads are not likely to cause conflict when placed into one instruction group." -msgstr "Asume que los stores y loads de coma flotante no pueden causar conflictos al colocarse en un grupo de instrucción." +msgstr "Asume que los stores y loads de coma flotante no es probable que provoquen conflictos al colocarse en un grupo de instrucción." #: config/ia64/ia64.opt:186 -#, fuzzy -#| msgid "Soft limit on number of memory insns per instruction group, giving lower priority to subsequent memory insns attempting to schedule in the same insn group. Frequently useful to prevent cache bank conflicts. Default value is 1" msgid "Soft limit on number of memory insns per instruction group, giving lower priority to subsequent memory insns attempting to schedule in the same insn group. Frequently useful to prevent cache bank conflicts. Default value is 1." -msgstr "Límite suave en el número de insns de memoria por grupo de instrucción, dando una prioridad más baja a insns de memoria subsecuentes que intenten calendarizar en el mismo grupo insn. Frecuentemente útil para prevenir conflictos en el banco de caché. El valor por defecto es 1." +msgstr "Límite suave en el número de insns de memoria por grupo de instrucción, dando una prioridad más baja a insns de memoria subsiguientes que intenten planificar en el mismo grupo insn. Frecuentemente útil para prevenir conflictos en el banco de caché. El valor por defecto es 1." #: config/ia64/ia64.opt:190 -#, fuzzy -#| msgid "Disallow more than 'msched-max-memory-insns' in instruction group. Otherwise, limit is 'soft' (prefer non-memory operations when limit is reached)" msgid "Disallow more than 'msched-max-memory-insns' in instruction group. Otherwise, limit is 'soft' (prefer non-memory operations when limit is reached)." msgstr "Desactiva más de 'msched-max-memory-insns' en el grupo de instrucción. De otra forma, el límite es 'soft' (se prefieren operaciones que no sean de memoria cuando se alcanza el límite)." #: config/ia64/ia64.opt:194 -#, fuzzy -#| msgid "Don't generate checks for control speculation in selective scheduling" msgid "Don't generate checks for control speculation in selective scheduling." -msgstr "No genera revisiones para el control de especulación en la calendarización selectiva." +msgstr "No genera revisiones para el control de especulación en la planificación selectiva." #: config/spu/spu.opt:20 -#, fuzzy -#| msgid "Emit warnings when run-time relocations are generated" msgid "Emit warnings when run-time relocations are generated." msgstr "Emite avisos cuando se generan las reubicaciones de tiempo de ejecución." #: config/spu/spu.opt:24 -#, fuzzy -#| msgid "Emit errors when run-time relocations are generated" msgid "Emit errors when run-time relocations are generated." msgstr "Emite errores cuando se generan las reubicaciones de tiempo de ejecución." #: config/spu/spu.opt:28 -#, fuzzy -#| msgid "Specify cost of branches (Default 20)" msgid "Specify cost of branches (Default 20)." msgstr "Especifica el costo de las ramificaciones (20 por defecto)." #: config/spu/spu.opt:32 -#, fuzzy -#| msgid "Make sure loads and stores are not moved past DMA instructions" msgid "Make sure loads and stores are not moved past DMA instructions." msgstr "Se asegura que las instrucciones load y store no se mueven después de las instrucciones DMA." #: config/spu/spu.opt:36 -#, fuzzy -#| msgid "volatile must be specified on any memory that is effected by DMA" msgid "volatile must be specified on any memory that is effected by DMA." -msgstr "Se debe especificar volatile en cualquier memoria que sea afectada por DMA." +msgstr "se debe especificar volatile en cualquier memoria que sea afectada por DMA." #: config/spu/spu.opt:40 config/spu/spu.opt:44 -#, fuzzy -#| msgid "Insert nops when it might improve performance by allowing dual issue (default)" msgid "Insert nops when it might improve performance by allowing dual issue (default)." msgstr "Inserta nops cuando se puede mejorar el rendimiento permitiendo el asunto dual (por defecto)." #: config/spu/spu.opt:48 -#, fuzzy -#| msgid "Use standard main function as entry for startup" msgid "Use standard main function as entry for startup." msgstr "Usa la función main estándar como entrada para el inicio." #: config/spu/spu.opt:52 -#, fuzzy -#| msgid "Generate branch hints for branches" msgid "Generate branch hints for branches." msgstr "Genera pistas de ramificación para las ramificaciones." #: config/spu/spu.opt:56 -#, fuzzy -#| msgid "Maximum number of nops to insert for a hint (Default 2)" msgid "Maximum number of nops to insert for a hint (Default 2)." msgstr "Número máximo de nops a insertar para una pista (Por defecto 2)." #: config/spu/spu.opt:60 -#, fuzzy -#| msgid "Approximate maximum number of instructions to allow between a hint and its branch [125]" msgid "Approximate maximum number of instructions to allow between a hint and its branch [125]." msgstr "El número máximo aproximado de instrucciones a permitir entre una pista y su ramificación [125]." #: config/spu/spu.opt:64 -#, fuzzy -#| msgid "Generate code for 18 bit addressing" msgid "Generate code for 18 bit addressing." msgstr "Genera código para direccionamiento de 18 bit." #: config/spu/spu.opt:68 -#, fuzzy -#| msgid "Generate code for 32 bit addressing" msgid "Generate code for 32 bit addressing." msgstr "Genera código para direccionamiento de 32 bit." #: config/spu/spu.opt:76 -#, fuzzy -#| msgid "Insert hbrp instructions after hinted branch targets to avoid the SPU hang issue" msgid "Insert hbrp instructions after hinted branch targets to avoid the SPU hang issue." msgstr "Inserta instrucciones hbrp después de los objetivos ramificados con pista para evitar el problema del colgado de SPU." #: config/spu/spu.opt:80 config/i386/i386.opt:247 config/s390/s390.opt:56 -#, fuzzy -#| msgid "Generate code for given CPU" msgid "Generate code for given CPU." -msgstr "Genera código para el CPU dado." +msgstr "Genera código para la CPU dada." #: config/spu/spu.opt:88 -#, fuzzy -#| msgid "Access variables in 32-bit PPU objects (default)" msgid "Access variables in 32-bit PPU objects (default)." msgstr "Accede a las variables en objetos PPU de 32-bit (por defecto)." #: config/spu/spu.opt:92 -#, fuzzy -#| msgid "Access variables in 64-bit PPU objects" msgid "Access variables in 64-bit PPU objects." -msgstr "Accede a las varialbes en objetos PPU de 64-bit." +msgstr "Accede a las variables en objetos PPU de 64-bit." #: config/spu/spu.opt:96 -#, fuzzy -#| msgid "Allow conversions between __ea and generic pointers (default)" msgid "Allow conversions between __ea and generic pointers (default)." msgstr "Permite las conversiones entre __ea y punteros genéricos (por defecto)." #: config/spu/spu.opt:100 -#, fuzzy -#| msgid "Size (in KB) of software data cache" msgid "Size (in KB) of software data cache." msgstr "Tamaño (en KB) del caché de datos de software." #: config/spu/spu.opt:104 -#, fuzzy -#| msgid "Atomically write back software data cache lines (default)" msgid "Atomically write back software data cache lines (default)." msgstr "Escribe hacia atrás atómicamente las líneas de caché de datos de software (por defecto)." @@ -8662,8 +8612,6 @@ msgid "preferentially allocate registers that allow short instruction generation msgstr "aloja de preferencia registros que permitan la generación de instrucciones short." #: config/epiphany/epiphany.opt:32 -#, fuzzy -#| msgid "Set branch cost" msgid "Set branch cost." msgstr "Establece el costo de ramificación." @@ -8672,32 +8620,22 @@ msgid "enable conditional move instruction usage." msgstr "activa el uso de la instrucción move condicional." #: config/epiphany/epiphany.opt:40 -#, fuzzy -#| msgid "set number of nops to emit before each insn pattern" msgid "set number of nops to emit before each insn pattern." msgstr "establece el número de nops a emitir antes de cada patrón insn." #: config/epiphany/epiphany.opt:52 -#, fuzzy -#| msgid "Use software floating point comparisons" msgid "Use software floating point comparisons." msgstr "Usa comparaciones de coma flotante de software." #: config/epiphany/epiphany.opt:56 -#, fuzzy -#| msgid "Enable split of 32 bit immediate loads into low / high part" msgid "Enable split of 32 bit immediate loads into low / high part." msgstr "Activa la división de loads inmediatos de 32 bit en partes low / high." #: config/epiphany/epiphany.opt:60 -#, fuzzy -#| msgid "Enable use of POST_INC / POST_DEC" msgid "Enable use of POST_INC / POST_DEC." msgstr "Activa el uso de POST_INC / POST_DEC." #: config/epiphany/epiphany.opt:64 -#, fuzzy -#| msgid "Enable use of POST_MODIFY" msgid "Enable use of POST_MODIFY." msgstr "Activa el uso de POST_MODIFY." @@ -8707,17 +8645,13 @@ msgstr "Establece el número prealojado de bytes en la pila para que use el llam #: config/epiphany/epiphany.opt:72 msgid "Assume round to nearest is selected for purposes of scheduling." -msgstr "Asume que está seleccionado el redondeo al más cercano para propósitos de calendarización." +msgstr "Asume que está seleccionado el redondeo al más cercano para propósitos de planificación." #: config/epiphany/epiphany.opt:76 -#, fuzzy -#| msgid "Generate call insns as indirect calls" msgid "Generate call insns as indirect calls." msgstr "Genera las llamadas insns como llamadas indirectas." #: config/epiphany/epiphany.opt:80 -#, fuzzy -#| msgid "Generate call insns as direct calls" msgid "Generate call insns as direct calls." msgstr "Genera las llamadas insns como llamadas directas." @@ -8738,10 +8672,8 @@ msgid "Split unaligned 8 byte vector moves before post-modify address generation msgstr "Divide moves vectoriales de 8 byte sin alinear antes de post-modificar la generación de dirección." #: config/epiphany/epiphany.opt:132 -#, fuzzy -#| msgid "Use hardware floating point conversion instructions" msgid "Use the floating point unit for integer add/subtract." -msgstr "Usa instrucciones de conversión de coma flotante de hardware." +msgstr "Usa la unidad de coma flotante para suma/resta de enteros." #: config/epiphany/epiphany.opt:136 msgid "Set register to hold -1." @@ -8749,252 +8681,174 @@ msgstr "Establece el registro para conservar -1." #: config/ft32/ft32.opt:23 msgid "target the software simulator." -msgstr "" +msgstr "destina al simulador software." #: config/ft32/ft32.opt:27 config/s390/s390.opt:201 config/mips/mips.opt:385 -#, fuzzy -#| msgid "Use ROM instead of RAM" msgid "Use LRA instead of reload." -msgstr "Usa la ROM en lugar de la RAM." +msgstr "Usa la LRA en lugar de recarga." #: config/ft32/ft32.opt:31 -#, fuzzy -#| msgid "Allow the use of MDMX instructions" msgid "Avoid use of the DIV and MOD instructions" -msgstr "Permite el uso de las instrucciones MDMX" +msgstr "Evita el uso de las instrucciones DIV y MOD" #: config/h8300/h8300.opt:23 -#, fuzzy -#| msgid "Generate H8S code" msgid "Generate H8S code." msgstr "Genera código H8S." #: config/h8300/h8300.opt:27 -#, fuzzy -#| msgid "Generate H8SX code" msgid "Generate H8SX code." msgstr "Genera código H8SX." #: config/h8300/h8300.opt:31 -#, fuzzy -#| msgid "Generate H8S/2600 code" msgid "Generate H8S/2600 code." msgstr "Genera código H8S/2600." #: config/h8300/h8300.opt:35 -#, fuzzy -#| msgid "Make integers 32 bits wide" msgid "Make integers 32 bits wide." msgstr "Hace los enteros de 32 bits de anchura." #: config/h8300/h8300.opt:42 -#, fuzzy -#| msgid "Use registers for argument passing" msgid "Use registers for argument passing." msgstr "Usa registros para el paso de parámetros." #: config/h8300/h8300.opt:46 -#, fuzzy -#| msgid "Consider access to byte sized memory slow" msgid "Consider access to byte sized memory slow." msgstr "Considera lento el acceso a memoria de tamaño byte." #: config/h8300/h8300.opt:50 -#, fuzzy -#| msgid "Enable linker relaxing" msgid "Enable linker relaxing." msgstr "Activa la relajación del enlazador." #: config/h8300/h8300.opt:54 -#, fuzzy -#| msgid "Generate H8/300H code" msgid "Generate H8/300H code." msgstr "Genera código H8/300H." #: config/h8300/h8300.opt:58 -#, fuzzy -#| msgid "Enable the normal mode" msgid "Enable the normal mode." -msgstr "Activa el modelo normal." +msgstr "Activa el modo normal." #: config/h8300/h8300.opt:62 -#, fuzzy -#| msgid "Use H8/300 alignment rules" msgid "Use H8/300 alignment rules." msgstr "Usa las reglas de alineación H8/300." #: config/h8300/h8300.opt:66 msgid "Push extended registers on stack in monitor functions." -msgstr "" +msgstr "Empuja los registros extendidos a la pila en las funciones de monitorización." #: config/h8300/h8300.opt:70 msgid "Do not push extended registers on stack in monitor functions." -msgstr "" +msgstr "No empuja los registros extendidos a la pila en las funciones de monitorización." #: config/pdp11/pdp11.opt:23 -#, fuzzy -#| msgid "Generate code for an 11/10" msgid "Generate code for an 11/10." msgstr "Genera código para un 11/10." #: config/pdp11/pdp11.opt:27 -#, fuzzy -#| msgid "Generate code for an 11/40" msgid "Generate code for an 11/40." msgstr "Genera código para un 11/40." #: config/pdp11/pdp11.opt:31 -#, fuzzy -#| msgid "Generate code for an 11/45" msgid "Generate code for an 11/45." msgstr "Genera código para un 11/45." #: config/pdp11/pdp11.opt:35 -#, fuzzy -#| msgid "Return floating-point results in ac0 (fr0 in Unix assembler syntax)" msgid "Return floating-point results in ac0 (fr0 in Unix assembler syntax)." msgstr "Devuelve los resultados de coma flotante en ac0 (fr0 en sintaxis de ensamblador Unix)." #: config/pdp11/pdp11.opt:39 -#, fuzzy -#| msgid "Do not use inline patterns for copying memory" msgid "Do not use inline patterns for copying memory." msgstr "No usa patrones incluidos en línea para copiado de memoria." #: config/pdp11/pdp11.opt:43 -#, fuzzy -#| msgid "Use inline patterns for copying memory" msgid "Use inline patterns for copying memory." msgstr "Usa patrones incluidos en línea para copiado de memoria." #: config/pdp11/pdp11.opt:47 -#, fuzzy -#| msgid "Do not pretend that branches are expensive" msgid "Do not pretend that branches are expensive." msgstr "No pretende que las ramificaciones son costosas." #: config/pdp11/pdp11.opt:51 -#, fuzzy -#| msgid "Pretend that branches are expensive" msgid "Pretend that branches are expensive." msgstr "Pretende que las ramificaciones son costosas." #: config/pdp11/pdp11.opt:55 -#, fuzzy -#| msgid "Use the DEC assembler syntax" msgid "Use the DEC assembler syntax." msgstr "Usa la sintaxis de ensamblador DEC." #: config/pdp11/pdp11.opt:59 -#, fuzzy -#| msgid "Use 32 bit float" msgid "Use 32 bit float." msgstr "Usa float de 32 bit." #: config/pdp11/pdp11.opt:63 -#, fuzzy -#| msgid "Use 64 bit float" msgid "Use 64 bit float." msgstr "Usa float de 64 bit." #: config/pdp11/pdp11.opt:67 config/rs6000/rs6000.opt:177 #: config/frv/frv.opt:158 -#, fuzzy -#| msgid "Use hardware floating point" msgid "Use hardware floating point." msgstr "Usa coma flotante de hardware." #: config/pdp11/pdp11.opt:71 -#, fuzzy -#| msgid "Use 16 bit int" msgid "Use 16 bit int." msgstr "Usa int de 16 bit." #: config/pdp11/pdp11.opt:75 -#, fuzzy -#| msgid "Use 32 bit int" msgid "Use 32 bit int." msgstr "Usa int de 32 bit." #: config/pdp11/pdp11.opt:79 config/rs6000/rs6000.opt:173 -#, fuzzy -#| msgid "Do not use hardware floating point" msgid "Do not use hardware floating point." msgstr "No usa coma flotante de hardware." #: config/pdp11/pdp11.opt:83 -#, fuzzy -#| msgid "Target has split I&D" msgid "Target has split I&D." msgstr "El objetivo tiene I&D dividido." #: config/pdp11/pdp11.opt:87 -#, fuzzy -#| msgid "Use UNIX assembler syntax" msgid "Use UNIX assembler syntax." msgstr "Usa sintaxis de ensamblador UNIX." #: config/xtensa/xtensa.opt:23 -#, fuzzy -#| msgid "Use CONST16 instruction to load constants" msgid "Use CONST16 instruction to load constants." msgstr "Usa la instrucción CONST16 para cargar constantes." #: config/xtensa/xtensa.opt:27 -#, fuzzy -#| msgid "Disable position-independent code (PIC) for use in OS kernel code" msgid "Disable position-independent code (PIC) for use in OS kernel code." msgstr "Desactiva el código independiente de posición (PIC) para su uso en código de núcleo de SO." #: config/xtensa/xtensa.opt:31 -#, fuzzy -#| msgid "Use indirect CALLXn instructions for large programs" msgid "Use indirect CALLXn instructions for large programs." msgstr "Usa las instrucciones CALLXn indirectas para programas grandes." #: config/xtensa/xtensa.opt:35 -#, fuzzy -#| msgid "Automatically align branch targets to reduce branch penalties" msgid "Automatically align branch targets to reduce branch penalties." -msgstr "Alínea automáticamente los objetivos de las ramificaciones para reducir las penas de ramificación." +msgstr "Alinea automáticamente los objetivos de las ramificaciones para reducir las penas de ramificación." #: config/xtensa/xtensa.opt:39 -#, fuzzy -#| msgid "Intersperse literal pools with code in the text section" msgid "Intersperse literal pools with code in the text section." msgstr "Dispersa los conjuntos de literales con código en la sección de texto." #: config/xtensa/xtensa.opt:43 msgid "Relax literals in assembler and place them automatically in the text section." -msgstr "" +msgstr "Relaja los literales en ensamblador y los coloca automáticamente en la sección de texto." #: config/xtensa/xtensa.opt:47 -#, fuzzy -#| msgid "-mno-serialize-volatile\tDo not serialize volatile memory references with MEMW instructions" msgid "-mno-serialize-volatile\tDo not serialize volatile memory references with MEMW instructions." msgstr "-mno-serialize-volatile\tNo serializa las referencias a memoria volátil con instrucciones MEMW." #: config/i386/cygming.opt:23 -#, fuzzy -#| msgid "Create console application" msgid "Create console application." msgstr "Crea una aplicación de consola." #: config/i386/cygming.opt:27 -#, fuzzy -#| msgid "Generate code for a DLL" msgid "Generate code for a DLL." msgstr "Genera código para una DLL." #: config/i386/cygming.opt:31 -#, fuzzy -#| msgid "Ignore dllimport for functions" msgid "Ignore dllimport for functions." msgstr "Ignora dllimport para funciones." #: config/i386/cygming.opt:35 -#, fuzzy -#| msgid "Use Mingw-specific thread support" msgid "Use Mingw-specific thread support." msgstr "Usa el soporte de hilos específico de Mingw." @@ -9005,132 +8859,94 @@ msgid "Set Windows defines." msgstr "Establece las definiciones de Windows." #: config/i386/cygming.opt:43 -#, fuzzy -#| msgid "Create GUI application" msgid "Create GUI application." msgstr "Crea una aplicación con interfaz gráfica de usuario (GUI)." #: config/i386/cygming.opt:47 config/i386/interix.opt:32 -#, fuzzy -#| msgid "Use the GNU extension to the PE format for aligned common data" msgid "Use the GNU extension to the PE format for aligned common data." msgstr "Usa la extensión GNU para el formato PE para los datos comunes alineados." #: config/i386/cygming.opt:51 -#, fuzzy -#| msgid "Compile code that relies on Cygwin DLL wrappers to support C++ operator new/delete replacement" msgid "Compile code that relies on Cygwin DLL wrappers to support C++ operator new/delete replacement." msgstr "Compila código que depende de las envolturas DLL de Cygwin para admitir el reemplazo de los operadores de C++ new/delete." #: config/i386/cygming.opt:58 msgid "Put relocated read-only data into .data section." -msgstr "" +msgstr "Pone los datos de solo lectura reubicados en la sección .data." #: config/i386/mingw.opt:29 -#, fuzzy -#| msgid "Warn about none ISO msvcrt scanf/printf width extensions" msgid "Warn about none ISO msvcrt scanf/printf width extensions." -msgstr "Avisa sobre extensiones de anchura scanf/printf msvcrt que no son ISO." +msgstr "Advierte de extensiones de anchura scanf/printf msvcrt que no son ISO." #: config/i386/mingw.opt:33 msgid "For nested functions on stack executable permission is set." msgstr "Se establece el permiso ejecutable para las funciones anidadas en la pila." #: config/i386/mingw-w64.opt:23 -#, fuzzy -#| msgid "Use unicode startup and define UNICODE macro" msgid "Use unicode startup and define UNICODE macro." msgstr "Usa el inicio de unicode y define la macro UNICODE." #: config/i386/i386.opt:182 -#, fuzzy -#| msgid "sizeof(long double) is 16" msgid "sizeof(long double) is 16." msgstr "sizeof(long double) es 16." #: config/i386/i386.opt:186 config/i386/i386.opt:354 -#, fuzzy -#| msgid "Use hardware fp" msgid "Use hardware fp." msgstr "Usa fp de hardware." #: config/i386/i386.opt:190 -#, fuzzy -#| msgid "sizeof(long double) is 12" msgid "sizeof(long double) is 12." msgstr "sizeof(long double) es 12." #: config/i386/i386.opt:194 -#, fuzzy -#| msgid "Use 128-bit long double" msgid "Use 80-bit long double." -msgstr "Usa long doubles de 128 bits." +msgstr "Usa long doubles de 80 bits." #: config/i386/i386.opt:198 config/s390/s390.opt:130 #: config/sparc/long-double-switch.opt:27 config/alpha/alpha.opt:102 #, fuzzy -#| msgid "Use 64-bit long double" +#| msgid "Use 80-bit long double." msgid "Use 64-bit long double." -msgstr "Usa long doubles de 64 bits." +msgstr "Usa long doubles de 80 bits." #: config/i386/i386.opt:202 config/s390/s390.opt:126 #: config/sparc/long-double-switch.opt:23 config/alpha/alpha.opt:98 -#, fuzzy -#| msgid "Use 128-bit long double" msgid "Use 128-bit long double." msgstr "Usa long doubles de 128 bits." #: config/i386/i386.opt:206 config/sh/sh.opt:209 -#, fuzzy -#| msgid "Reserve space for outgoing arguments in the function prologue" msgid "Reserve space for outgoing arguments in the function prologue." msgstr "Reserva espacio para los argumentos de salida en el prólogo de la función." #: config/i386/i386.opt:210 -#, fuzzy -#| msgid "Align some doubles on dword boundary" msgid "Align some doubles on dword boundary." -msgstr "Alínea algunos doubles en límites de dword." +msgstr "Alinea algunos doubles en límites de dword." #: config/i386/i386.opt:214 -#, fuzzy -#| msgid "Function starts are aligned to this power of 2" msgid "Function starts are aligned to this power of 2." msgstr "Los inicios de las funciones se alinean a esta potencia de 2." #: config/i386/i386.opt:218 -#, fuzzy -#| msgid "Jump targets are aligned to this power of 2" msgid "Jump targets are aligned to this power of 2." msgstr "Los objetivos de salto se alinean a esta potencia de 2." #: config/i386/i386.opt:222 -#, fuzzy -#| msgid "Loop code aligned to this power of 2" msgid "Loop code aligned to this power of 2." msgstr "El código de ciclo se alinea a esta potencia de 2." #: config/i386/i386.opt:226 -#, fuzzy -#| msgid "Align destination of the string operations" msgid "Align destination of the string operations." -msgstr "Alínea el destino de las operaciones de cadenas." +msgstr "Alinea el destino de las operaciones de cadenas." #: config/i386/i386.opt:230 -#, fuzzy -#| msgid "Do not tune writable data alignment" msgid "Use the given data alignment." -msgstr "No ajusta la alineación de los datos modificables." +msgstr "Usa la alineación de los datos dada." #: config/i386/i386.opt:234 -#, fuzzy -#| msgid "Known TLS dialects (for use with the -mtls-dialect= option):" msgid "Known data alignment choices (for use with the -malign-data= option):" -msgstr "Dialectos TLS conocidos (para usar con la opción -mtls-dialect=):" +msgstr "Las opciones conocidas para alineamiento de datos (para usar con la opción -malign-data=):" #: config/i386/i386.opt:251 -#, fuzzy -#| msgid "Use given assembler dialect" msgid "Use given assembler dialect." msgstr "Usa el dialecto de ensamblador dado." @@ -9139,20 +8955,14 @@ msgid "Known assembler dialects (for use with the -masm-dialect= option):" msgstr "Dialectos de ensamblador conocidos (para uso con la opción -masm-dialect=):" #: config/i386/i386.opt:265 -#, fuzzy -#| msgid "Branches are this expensive (1-5, arbitrary units)" msgid "Branches are this expensive (1-5, arbitrary units)." msgstr "Las ramificaciones son así de caras (1-5, unidades arbitrarias)." #: config/i386/i386.opt:269 -#, fuzzy -#| msgid "Data greater than given threshold will go into .ldata section in x86-64 medium model" msgid "Data greater than given threshold will go into .ldata section in x86-64 medium model." msgstr "Los datos más grandes que el límite dado irán a la sección .ldata en el modeolo medium del x86-64." #: config/i386/i386.opt:273 -#, fuzzy -#| msgid "Use given x86-64 code model" msgid "Use given x86-64 code model." msgstr "Usa el modelo de código del x86-64 dado." @@ -9297,7 +9107,7 @@ msgstr "Usa las convenciones de paso de registro SSE para los modos SF y DF." #, fuzzy #| msgid "Realign stack in prologue" msgid "Realign stack in prologue." -msgstr "Realínea la pila en el prólogo." +msgstr "Realinea la pila en el prólogo." #: config/i386/i386.opt:442 #, fuzzy @@ -9405,7 +9215,7 @@ msgstr "" #, fuzzy #| msgid "Do dispatch scheduling if processor is bdver1 or bdver2 and Haifa scheduling" msgid "Do dispatch scheduling if processor is bdver1, bdver2, bdver3, bdver4" -msgstr "Despacha al planificador si el procesador es bdver1 o bdver2 y la calendarización es Haifa" +msgstr "Despacha al planificador si el procesador es bdver1 o bdver2 y la planificación es Haifa" #: config/i386/i386.opt:582 msgid "Use 128-bit AVX instructions instead of 256-bit AVX instructions in the auto-vectorizer." @@ -9950,7 +9760,7 @@ msgstr "Usa convenciones de llamada transportable." #, fuzzy #| msgid "Specify CPU for scheduling purposes. Valid arguments are 700, 7100, 7100LC, 7200, 7300, and 8000" msgid "Specify CPU for scheduling purposes. Valid arguments are 700, 7100, 7100LC, 7200, 7300, and 8000." -msgstr "Especifica el CPU por razones de calendarización. Los argumentos válidos son 700, 7100, 7100LC, 7200, 7300, y 8000." +msgstr "Especifica el CPU por razones de planificación. Los argumentos válidos son 700, 7100, 7100LC, 7200, 7300, y 8000." #: config/pa/pa.opt:132 config/frv/frv.opt:215 #, fuzzy @@ -10343,7 +10153,7 @@ msgstr "Compila para el m32r." #, fuzzy #| msgid "Align all loops to 32 byte boundary" msgid "Align all loops to 32 byte boundary." -msgstr "Alínea todos los bucles al límite de 32 byte." +msgstr "Alinea todos los bucles al límite de 32 byte." #: config/m32r/m32r.opt:50 #, fuzzy @@ -10727,7 +10537,7 @@ msgstr "Almacena nombres de función en el código objeto." #, fuzzy #| msgid "Permit scheduling of a function's prologue sequence" msgid "Permit scheduling of a function's prologue sequence." -msgstr "Permite la calendarización de un secuencia de los prólogos de función." +msgstr "Permite la planificación de un secuencia de los prólogos de función." #: config/arm/arm.opt:179 config/rs6000/rs6000.opt:248 #, fuzzy @@ -11484,7 +11294,7 @@ msgstr "Argumentos válidos para -malign-:" #, fuzzy #| msgid "Specify scheduling priority for dispatch slot restricted insns" msgid "Specify scheduling priority for dispatch slot restricted insns." -msgstr "Especifica la prioridad de calendarización para despachar insns restringidos por ranuras." +msgstr "Especifica la prioridad de planificación para despachar insns restringidos por ranuras." #: config/rs6000/rs6000.opt:504 #, fuzzy @@ -11536,7 +11346,7 @@ msgstr "" #, fuzzy #| msgid "Align destination of the string operations" msgid "Allow sign extension in fusion operations." -msgstr "Alínea el destino de las operaciones de cadenas." +msgstr "Alinea el destino de las operaciones de cadenas." #: config/rs6000/rs6000.opt:562 msgid "Use/do not use vector and scalar instructions added in ISA 2.07." @@ -11658,7 +11468,7 @@ msgstr "Selecciona el método para el manejo de sdata." #, fuzzy #| msgid "Align to the base type of the bit-field" msgid "Align to the base type of the bit-field." -msgstr "Alínea al tipo base del campo de bit." +msgstr "Alinea al tipo base del campo de bit." #: config/rs6000/sysv4.opt:57 config/rs6000/sysv4.opt:61 #, fuzzy @@ -13286,7 +13096,7 @@ msgstr "Especifica el CPU para propósitos de generación de código." #: config/iq2000/iq2000.opt:47 msgid "Specify CPU for scheduling purposes." -msgstr "Especifica el CPU para propósitos de calendarización." +msgstr "Especifica el CPU para propósitos de planificación." #: config/iq2000/iq2000.opt:51 msgid "Known IQ2000 CPUs (for use with the -mcpu= option):" @@ -13390,13 +13200,13 @@ msgstr "No ajusta la alineación del código y de datos de sólo lectura." #, fuzzy #| msgid "Align code and data to 32 bits" msgid "Align code and data to 32 bits." -msgstr "Alínea código y datos a 32 bits." +msgstr "Alinea código y datos a 32 bits." #: config/cris/cris.opt:133 #, fuzzy #| msgid "Don't align items in code or data" msgid "Don't align items in code or data." -msgstr "No alínea los elementos en el código o los datos." +msgstr "No alinea los elementos en el código o los datos." #: config/cris/cris.opt:142 #, fuzzy @@ -13744,7 +13554,7 @@ msgstr "Permite evitar cut2 en SH5." #, fuzzy #| msgid "Align doubles at 64-bit boundaries" msgid "Align doubles at 64-bit boundaries." -msgstr "Alínea doubles en límites de 64-bit." +msgstr "Alinea doubles en límites de 64-bit." #: config/sh/sh.opt:257 #, fuzzy @@ -13970,7 +13780,7 @@ msgstr "Habilita el Coprocesador MeP con registros de 64-bit." #, fuzzy #| msgid "Enable IVC2 scheduling" msgid "Enable IVC2 scheduling." -msgstr "Activa la calendarización IVC2." +msgstr "Activa la planificación IVC2." #: config/mep/mep.opt:71 #, fuzzy @@ -15548,25 +15358,25 @@ msgstr "" #, fuzzy #| msgid "Align the start of functions" msgid "Align the start of functions." -msgstr "Alínea el inicio de las funciones." +msgstr "Alinea el inicio de las funciones." #: common.opt:899 #, fuzzy #| msgid "Align labels which are only reached by jumping" msgid "Align labels which are only reached by jumping." -msgstr "Alínea las etiquetas que solamente se alcanzan saltando." +msgstr "Alinea las etiquetas que solamente se alcanzan saltando." #: common.opt:906 #, fuzzy #| msgid "Align all labels" msgid "Align all labels." -msgstr "Alínea todas las etiquetas." +msgstr "Alinea todas las etiquetas." #: common.opt:913 #, fuzzy #| msgid "Align the start of loops" msgid "Align the start of loops." -msgstr "Alínea el inicio de los bucles." +msgstr "Alinea el inicio de los bucles." #: common.opt:936 #, fuzzy @@ -16427,13 +16237,13 @@ msgstr "-fmessage-length=\tLimita los diagnósticos a caracte #, fuzzy #| msgid "Perform SMS based modulo scheduling before the first scheduling pass" msgid "Perform SMS based modulo scheduling before the first scheduling pass." -msgstr "Realiza la calendarización SMS basada en módulo antes del primer paso de calendarización." +msgstr "Realiza la planificación SMS basada en módulo antes del primer paso de calendarización." #: common.opt:1747 #, fuzzy #| msgid "Perform SMS based modulo scheduling with register moves allowed" msgid "Perform SMS based modulo scheduling with register moves allowed." -msgstr "Realiza la calendarización módulo basada en SMS con movimientos permitidos de registros." +msgstr "Realiza la planificación módulo basada en SMS con movimientos permitidos de registros." #: common.opt:1751 #, fuzzy @@ -16457,7 +16267,7 @@ msgstr "Usa la eliminación de almacenamiento muerto de RTL." #, fuzzy #| msgid "Enable/Disable the traditional scheduling in loops that already passed modulo scheduling" msgid "Enable/Disable the traditional scheduling in loops that already passed modulo scheduling." -msgstr "Activa/Desactiva la calendarización tradicional en bucles que ya pasaron la calendarización módulo." +msgstr "Activa/Desactiva la planificación tradicional en bucles que ya pasaron la calendarización módulo." #: common.opt:1767 #, fuzzy @@ -16693,7 +16503,7 @@ msgstr "" #, fuzzy #| msgid "Enable register pressure sensitive insn scheduling" msgid "Relief of register pressure through live range shrinkage." -msgstr "Activa la calendarización de insn sensible a la presión de registros." +msgstr "Activa la planificación de insn sensible a la presión de registros." #: common.opt:1962 #, fuzzy @@ -16751,13 +16561,13 @@ msgstr "Desactiva las optimizaciones que asumen la conducta de un FP que redonde #, fuzzy #| msgid "Enable scheduling across basic blocks" msgid "Enable scheduling across basic blocks." -msgstr "Activa la calendarización entre bloques básicos." +msgstr "Activa la planificación entre bloques básicos." #: common.opt:2011 #, fuzzy #| msgid "Enable register pressure sensitive insn scheduling" msgid "Enable register pressure sensitive insn scheduling." -msgstr "Activa la calendarización de insn sensible a la presión de registros." +msgstr "Activa la planificación de insn sensible a la presión de registros." #: common.opt:2015 #, fuzzy @@ -16787,7 +16597,7 @@ msgstr "-fsched-verbose=\tEstablece el nivel de detalle del planificado #, fuzzy #| msgid "If scheduling post reload, do superblock scheduling" msgid "If scheduling post reload, do superblock scheduling." -msgstr "Si se calendariza después de la recarga, hace la calendarización de superbloque." +msgstr "Si se calendariza después de la recarga, hace la planificación de superbloque." #: common.opt:2039 #, fuzzy @@ -16805,25 +16615,25 @@ msgstr "Recalendariza las instrucciones después del alojamiento de registros." #, fuzzy #| msgid "Schedule instructions using selective scheduling algorithm" msgid "Schedule instructions using selective scheduling algorithm." -msgstr "Calendariza instrucciones usando el algoritmo de calendarización selectivo." +msgstr "Calendariza instrucciones usando el algoritmo de planificación selectivo." #: common.opt:2054 #, fuzzy #| msgid "Run selective scheduling after reload" msgid "Run selective scheduling after reload." -msgstr "Ejecuta la calendarización selectiva después de recargar." +msgstr "Ejecuta la planificación selectiva después de recargar." #: common.opt:2058 #, fuzzy #| msgid "Perform software pipelining of inner loops during selective scheduling" msgid "Perform software pipelining of inner loops during selective scheduling." -msgstr "Realiza el `pipelining' de software de los bucles internos durante la calendarización selectiva." +msgstr "Realiza el `pipelining' de software de los bucles internos durante la planificación selectiva." #: common.opt:2062 #, fuzzy #| msgid "Perform software pipelining of outer loops during selective scheduling" msgid "Perform software pipelining of outer loops during selective scheduling." -msgstr "Realiza el `pipelining' de software de los bucles externos durante la calendarización selectiva." +msgstr "Realiza el `pipelining' de software de los bucles externos durante la planificación selectiva." #: common.opt:2066 #, fuzzy @@ -16839,7 +16649,7 @@ msgstr "" #, fuzzy #| msgid "Allow premature scheduling of queued insns" msgid "Allow premature scheduling of queued insns." -msgstr "Permite la calendarización prematura de insns encoladas." +msgstr "Permite la planificación prematura de insns encoladas." #: common.opt:2080 #, fuzzy @@ -16851,13 +16661,13 @@ msgstr "-fsched-stalled-insns=\tEstablece el número de insns encoladas #, fuzzy #| msgid "Set dependence distance checking in premature scheduling of queued insns" msgid "Set dependence distance checking in premature scheduling of queued insns." -msgstr "Establece la revisión de distancia de dependencias en la calendarización prematura de insns encoladas." +msgstr "Establece la revisión de distancia de dependencias en la planificación prematura de insns encoladas." #: common.opt:2092 #, fuzzy #| msgid "-fsched-stalled-insns-dep=\tSet dependence distance checking in premature scheduling of queued insns" msgid "-fsched-stalled-insns-dep=\tSet dependence distance checking in premature scheduling of queued insns." -msgstr "-fsched-stalled-insns-dep=\tEstablece la revisión de distancia de dependencias en la calendarización prematura de insns encoladas." +msgstr "-fsched-stalled-insns-dep=\tEstablece la revisión de distancia de dependencias en la planificación prematura de insns encoladas." #: common.opt:2096 #, fuzzy @@ -22813,7 +22623,7 @@ msgstr "-fname-mangling-version ya no tiene soporte" #: toplev.c:1316 #, gcc-internal-format msgid "instruction scheduling not supported on this target machine" -msgstr "no se admite la calendarización de instrucciones en este objetivo" +msgstr "no se admite la planificación de instrucciones en este objetivo" #: toplev.c:1320 #, gcc-internal-format @@ -31375,7 +31185,7 @@ msgstr "la generación de instrucciones Probables a Ramificar está activada, pe #, fuzzy, gcc-internal-format #| msgid "instruction scheduling not supported on this target machine" msgid "CDX instructions are only supported with R2 architecture" -msgstr "no se admite la calendarización de instrucciones en este objetivo" +msgstr "no se admite la planificación de instrucciones en este objetivo" #: config/nios2/nios2.c:1383 #, fuzzy, gcc-internal-format @@ -63748,10 +63558,10 @@ msgstr "se crea un selector para el método %qE que no existe" #~ msgstr "Especifica el rango de registros a convertir en fijos" #~ msgid "If set, data speculative instructions will be chosen for schedule only if there are no other choices at the moment " -#~ msgstr "Si está definido, se escogerán las instrucciones especulativas de datos para calendarización sólo si no hay otras opciones por el momento" +#~ msgstr "Si está definido, se escogerán las instrucciones especulativas de datos para planificación sólo si no hay otras opciones por el momento" #~ msgid "If set, control speculative instructions will be chosen for schedule only if there are no other choices at the moment " -#~ msgstr "Si está definido, se escogerán el control especulativo de instrucciones para calendarización sólo si no hay otras opciones por el momento" +#~ msgstr "Si está definido, se escogerán el control especulativo de instrucciones para planificación sólo si no hay otras opciones por el momento" #~ msgid "Ignored (obsolete)" #~ msgstr "Se descarta (obsoleto)" @@ -63812,7 +63622,7 @@ msgstr "se crea un selector para el método %qE que no existe" #~ msgstr "Soporte para el ABI Green Hills" #~ msgid "Specify CPU for scheduling purposes" -#~ msgstr "Especifica el CPU para propósitos de calendarización" +#~ msgstr "Especifica el CPU para propósitos de planificación" #~ msgid "Specify which type of AE to target. This option sets the mul-type and byte-access." #~ msgstr "Especifica a qué tipo de AE se apunta. Esta opción establece el tipo muly el acceso a byte." @@ -66103,7 +65913,7 @@ msgstr "se crea un selector para el método %qE que no existe" #~ msgstr "faltan argumentos para \"-%s\"" #~ msgid "If scheduling post reload, do trace scheduling" -#~ msgstr "Si se calendariza después de la recarga, hace trazado de calendarización" +#~ msgstr "Si se calendariza después de la recarga, hace trazado de planificación" #~ msgid "(Each undeclared identifier is reported only once" #~ msgstr "(Cada identificador sin declarar solamente se reporta una vez" @@ -68112,7 +67922,7 @@ msgstr "se crea un selector para el método %qE que no existe" #~ "\n" #~ msgid "Maximum number of loops to perform swing modulo scheduling on (mainly for debugging)" -#~ msgstr "Número máximo de bucles que realizan calendarización de cambio de módulo en (principalmente para depuración)" +#~ msgstr "Número máximo de bucles que realizan planificación de cambio de módulo en (principalmente para depuración)" #~ msgid "Given N calls and V call-clobbered vars in a function. Use .GLOBAL_VAR if NxV is larger than this limit" #~ msgstr "Dadas N llamadas y V variables sobreescritas por llamada en una función. Use .GLOBAL_VAR si NxV es mayor que este límite" @@ -69518,7 +69328,7 @@ msgstr "se crea un selector para el método %qE que no existe" #~ msgstr "No establece las definiciones de Windows" #~ msgid "Align doubles on word boundary" -#~ msgstr "Alínea doubles en límites de word" +#~ msgstr "Alinea doubles en límites de word" #~ msgid "Uninitialized locals in .data" #~ msgstr "Locales sin inicializar en .data" @@ -69533,7 +69343,7 @@ msgstr "se crea un selector para el método %qE que no existe" #~ msgstr "No genera sin, cos, sqrt para FPU" #~ msgid "Do not align destination of the string operations" -#~ msgstr "No alínea destino de las operaciones de cadenas" +#~ msgstr "No alinea destino de las operaciones de cadenas" #~ msgid "Do not inline all known string operations" #~ msgstr "No convierte a inline todas las operaciones de cadenas conocidas" @@ -69800,7 +69610,7 @@ msgstr "se crea un selector para el método %qE que no existe" #~ msgstr "Especifica sí/no si se utiliza la coma flotante en los GPRs" #~ msgid "Don't align to the base type of the bit-field" -#~ msgstr "No alínea al tipo base del campo de bit" +#~ msgstr "No alinea al tipo base del campo de bit" #~ msgid "Assume that unaligned accesses are handled by the system" #~ msgstr "Asume que los accesos sin alinear son manejados por el sistema" -- cgit v1.1 From f13d510e361b10bc0c83e211671f158c6665cd8e Mon Sep 17 00:00:00 2001 From: Kelvin Nilsen Date: Thu, 1 Dec 2016 22:52:07 +0000 Subject: re PR target/78577 (Fix define_insn operand types for vexturhlx, vexturhrx, vextuwlx, and vextuwrx patterns) gcc/ChangeLog: 2016-12-01 Kelvin Nilsen PR target/78577 * config/rs6000/vsx.md (vextuhlx): Revise mode of operand 2. (vextuhrx): Likewise. (vextuwlx): Likewise. (vextuwrx): Likewise. From-SVN: r243141 --- gcc/ChangeLog | 8 ++++++++ gcc/config/rs6000/vsx.md | 8 ++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b23481f..7f9dd0e 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2016-12-01 Kelvin Nilsen + + PR target/78577 + * config/rs6000/vsx.md (vextuhlx): Revise mode of operand 2. + (vextuhrx): Likewise. + (vextuwlx): Likewise. + (vextuwrx): Likewise. + 2016-12-01 David Malcolm * dwarf2out.c (dwarf2out_c_finalize): Reset early_dwarf and diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 01d275d..1801bc0 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -3648,7 +3648,7 @@ [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI [(match_operand:SI 1 "register_operand" "r") - (match_operand:V16QI 2 "altivec_register_operand" "v")] + (match_operand:V8HI 2 "altivec_register_operand" "v")] UNSPEC_VEXTUHLX))] "TARGET_P9_VECTOR" "vextuhlx %0,%1,%2" @@ -3659,7 +3659,7 @@ [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI [(match_operand:SI 1 "register_operand" "r") - (match_operand:V16QI 2 "altivec_register_operand" "v")] + (match_operand:V8HI 2 "altivec_register_operand" "v")] UNSPEC_VEXTUHRX))] "TARGET_P9_VECTOR" "vextuhrx %0,%1,%2" @@ -3670,7 +3670,7 @@ [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI [(match_operand:SI 1 "register_operand" "r") - (match_operand:V16QI 2 "altivec_register_operand" "v")] + (match_operand:V4SI 2 "altivec_register_operand" "v")] UNSPEC_VEXTUWLX))] "TARGET_P9_VECTOR" "vextuwlx %0,%1,%2" @@ -3681,7 +3681,7 @@ [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI [(match_operand:SI 1 "register_operand" "r") - (match_operand:V16QI 2 "altivec_register_operand" "v")] + (match_operand:V4SI 2 "altivec_register_operand" "v")] UNSPEC_VEXTUWRX))] "TARGET_P9_VECTOR" "vextuwrx %0,%1,%2" -- cgit v1.1 From b55e6680dec27409fe0e6dc800500564f1a06b53 Mon Sep 17 00:00:00 2001 From: Ma Jiang Date: Thu, 1 Dec 2016 23:02:51 +0000 Subject: acx.m4: Change "tail +16c" to "tail -c +17". * config/acx.m4: Change "tail +16c" to "tail -c +17". * configure: Regenerated. From-SVN: r243142 --- ChangeLog | 5 +++++ config/acx.m4 | 2 +- configure | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index a3320f1..bd2ad55 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 Ma Jiang + + * config/acx.m4: Change "tail +16c" to "tail -c +17". + * configure: Regenerated. + 2016-12-01 Matthias Klose * configure.ac: Don't use pkg-config to check for bdw-gc. diff --git a/config/acx.m4 b/config/acx.m4 index 9ff31eb..ab42972 100644 --- a/config/acx.m4 +++ b/config/acx.m4 @@ -404,7 +404,7 @@ AC_DEFUN([ACX_PROG_CMP_IGNORE_INITIAL], [AC_CACHE_CHECK([how to compare bootstrapped objects], gcc_cv_prog_cmp_skip, [ echo abfoo >t1 echo cdfoo >t2 - gcc_cv_prog_cmp_skip='tail +16c $$f1 > tmp-foo1; tail +16c $$f2 > tmp-foo2; cmp tmp-foo1 tmp-foo2' + gcc_cv_prog_cmp_skip='tail -c +17 $$f1 > tmp-foo1; tail -c +17 $$f2 > tmp-foo2; cmp tmp-foo1 tmp-foo2' if cmp t1 t2 2 2 > /dev/null 2>&1; then if cmp t1 t2 1 1 > /dev/null 2>&1; then : diff --git a/configure b/configure index fb79e73..b6389e4 100755 --- a/configure +++ b/configure @@ -5273,7 +5273,7 @@ if test "${gcc_cv_prog_cmp_skip+set}" = set; then : else echo abfoo >t1 echo cdfoo >t2 - gcc_cv_prog_cmp_skip='tail +16c $$f1 > tmp-foo1; tail +16c $$f2 > tmp-foo2; cmp tmp-foo1 tmp-foo2' + gcc_cv_prog_cmp_skip='tail -c +17 $$f1 > tmp-foo1; tail -c +17 $$f2 > tmp-foo2; cmp tmp-foo1 tmp-foo2' if cmp t1 t2 2 2 > /dev/null 2>&1; then if cmp t1 t2 1 1 > /dev/null 2>&1; then : -- cgit v1.1 From 90ee6453b254cd77819bc30d9d13a3c9828fd1c5 Mon Sep 17 00:00:00 2001 From: Elizebeth Punnoose Date: Thu, 1 Dec 2016 23:11:35 +0000 Subject: re PR fortran/77505 (Negative character length not treated as LEN=0) 2016-12-01 Elizebeth Punnoose PR fortran/77505 * trans-array.c (trans_array_constructor): Treat negative character length as LEN = 0. 2016-12-01 Elizebeth Punnoose PR fortran/77505 * gfortran.dg/char_length_20.f90: New test. * gfortran.dg/char_length_21.f90: Ditto. From-SVN: r243143 --- gcc/fortran/ChangeLog | 6 ++++++ gcc/fortran/trans-array.c | 25 +++++++++++++++++++++++++ gcc/testsuite/ChangeLog | 6 ++++++ gcc/testsuite/gfortran.dg/char_length_20.f90 | 13 +++++++++++++ gcc/testsuite/gfortran.dg/char_length_21.f90 | 11 +++++++++++ 5 files changed, 61 insertions(+) create mode 100644 gcc/testsuite/gfortran.dg/char_length_20.f90 create mode 100644 gcc/testsuite/gfortran.dg/char_length_21.f90 diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index d410392..20a9f2e 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,9 @@ +2016-12-01 Elizebeth Punnoose + + PR fortran/77505 + * trans-array.c (trans_array_constructor): Treat negative character + length as LEN = 0. + 2016-12-01 Steven G. Kargl PR fortran/78279 diff --git a/gcc/fortran/trans-array.c b/gcc/fortran/trans-array.c index 803462a4..ac90a4b 100644 --- a/gcc/fortran/trans-array.c +++ b/gcc/fortran/trans-array.c @@ -2226,6 +2226,8 @@ trans_array_constructor (gfc_ss * ss, locus * where) gfc_ss_info *ss_info; gfc_expr *expr; gfc_ss *s; + tree neg_len; + char *msg; /* Save the old values for nested checking. */ old_first_len = first_len; @@ -2271,6 +2273,29 @@ trans_array_constructor (gfc_ss * ss, locus * where) gfc_conv_expr_type (&length_se, expr->ts.u.cl->length, gfc_charlen_type_node); ss_info->string_length = length_se.expr; + + /* Check if the character length is negative. If it is, then + set LEN = 0. */ + neg_len = fold_build2_loc (input_location, LT_EXPR, + boolean_type_node, ss_info->string_length, + build_int_cst (gfc_charlen_type_node, 0)); + /* Print a warning if bounds checking is enabled. */ + if (gfc_option.rtcheck & GFC_RTCHECK_BOUNDS) + { + msg = xasprintf ("Negative character length treated as LEN = 0"); + gfc_trans_runtime_check (false, true, neg_len, &length_se.pre, + where, msg); + free (msg); + } + + ss_info->string_length + = fold_build3_loc (input_location, COND_EXPR, + gfc_charlen_type_node, neg_len, + build_int_cst (gfc_charlen_type_node, 0), + ss_info->string_length); + ss_info->string_length = gfc_evaluate_now (ss_info->string_length, + &length_se.pre); + gfc_add_block_to_block (&outer_loop->pre, &length_se.pre); gfc_add_block_to_block (&outer_loop->post, &length_se.post); } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 321a48a..dcbdf56 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2016-12-01 Elizebeth Punnoose + + PR fortran/77505 + * gfortran.dg/char_length_20.f90: New test. + * gfortran.dg/char_length_21.f90: Ditto. + 2016-12-01 Steven G. Kargl PR fortran/78279 diff --git a/gcc/testsuite/gfortran.dg/char_length_20.f90 b/gcc/testsuite/gfortran.dg/char_length_20.f90 new file mode 100644 index 0000000..38a19c5 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/char_length_20.f90 @@ -0,0 +1,13 @@ +! { dg-do run } +! { dg-options "-fcheck=bounds" } +program rabbithole + implicit none + character(len=:), allocatable :: text_block(:) + integer i, ii + character(len=10) :: cten='abcdefghij' + character(len=20) :: ctwenty='abcdefghijabcdefghij' + ii = -6 + text_block=[ character(len=ii) :: cten, ctwenty ] + if (any(len_trim(text_block) /= 0)) call abort +end program rabbithole +! { dg-output "At line 10 of file .*char_length_20.f90.*Fortran runtime warning: Negative character length treated as LEN = 0" } diff --git a/gcc/testsuite/gfortran.dg/char_length_21.f90 b/gcc/testsuite/gfortran.dg/char_length_21.f90 new file mode 100644 index 0000000..76b7e8e --- /dev/null +++ b/gcc/testsuite/gfortran.dg/char_length_21.f90 @@ -0,0 +1,11 @@ +! { dg-do run } +program rabbithole + implicit none + character(len=:), allocatable :: text_block(:) + integer i, ii + character(len=10) :: cten='abcdefghij' + character(len=20) :: ctwenty='abcdefghijabcdefghij' + ii = -6 + text_block = [character(len=ii) :: cten, ctwenty] + if (any(len_trim(text_block) /= 0)) call abort +end program rabbithole -- cgit v1.1 From 0e81719703bf681533220f3629cc4f1a24110778 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 2 Dec 2016 00:15:57 +0100 Subject: re PR tree-optimization/78586 (Wrong code caused by printf-return-value) PR tree-optimization/78586 * gimple-ssa-sprintf.c (format_integer): Don't handle NOP_EXPR, CONVERT_EXPR or COMPONENT_REF here. Formatting fix. For SSA_NAME_DEF_STMT with NOP_EXPR only change argtype if the rhs1's type is INTEGER_TYPE or POINTER_TYPE. From-SVN: r243145 --- gcc/ChangeLog | 8 ++++++++ gcc/gimple-ssa-sprintf.c | 30 ++++++++++++------------------ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7f9dd0e..c3170c0 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2016-12-02 Jakub Jelinek + + PR tree-optimization/78586 + * gimple-ssa-sprintf.c (format_integer): Don't handle NOP_EXPR, + CONVERT_EXPR or COMPONENT_REF here. Formatting fix. For + SSA_NAME_DEF_STMT with NOP_EXPR only change argtype if the rhs1's + type is INTEGER_TYPE or POINTER_TYPE. + 2016-12-01 Kelvin Nilsen PR target/78577 diff --git a/gcc/gimple-ssa-sprintf.c b/gcc/gimple-ssa-sprintf.c index 99a635a..e86c4dc 100644 --- a/gcc/gimple-ssa-sprintf.c +++ b/gcc/gimple-ssa-sprintf.c @@ -968,24 +968,13 @@ format_integer (const conversion_spec &spec, tree arg) } else if (TREE_CODE (TREE_TYPE (arg)) == INTEGER_TYPE || TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE) - { - /* Determine the type of the provided non-constant argument. */ - if (TREE_CODE (arg) == NOP_EXPR) - arg = TREE_OPERAND (arg, 0); - else if (TREE_CODE (arg) == CONVERT_EXPR) - arg = TREE_OPERAND (arg, 0); - if (TREE_CODE (arg) == COMPONENT_REF) - arg = TREE_OPERAND (arg, 1); - - argtype = TREE_TYPE (arg); - } + /* Determine the type of the provided non-constant argument. */ + argtype = TREE_TYPE (arg); else - { - /* Don't bother with invalid arguments since they likely would - have already been diagnosed, and disable any further checking - of the format string by returning [-1, -1]. */ - return fmtresult (); - } + /* Don't bother with invalid arguments since they likely would + have already been diagnosed, and disable any further checking + of the format string by returning [-1, -1]. */ + return fmtresult (); fmtresult res; @@ -1059,7 +1048,12 @@ format_integer (const conversion_spec &spec, tree arg) } if (code == NOP_EXPR) - argtype = TREE_TYPE (gimple_assign_rhs1 (def)); + { + tree type = TREE_TYPE (gimple_assign_rhs1 (def)); + if (TREE_CODE (type) == INTEGER_TYPE + || TREE_CODE (type) == POINTER_TYPE) + argtype = type; + } } } } -- cgit v1.1 From f3adbf9e9355aff989bd5b0c6ba227cc01bf57ec Mon Sep 17 00:00:00 2001 From: GCC Administrator Date: Fri, 2 Dec 2016 00:16:20 +0000 Subject: Daily bump. From-SVN: r243150 --- gcc/DATESTAMP | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index 73f27d5..b720a20 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20161201 +20161202 -- cgit v1.1 From b06496b1617ffcaec0e82fd4cca9eae5e0301cd5 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Thu, 1 Dec 2016 23:40:57 -0700 Subject: * tree-ssa-threadedge.c (record_temporary_equivalences_from_stmts_at_dest): Avoid temporary propagation of operands if there are no operands. From-SVN: r243152 --- gcc/ChangeLog | 6 ++++++ gcc/tree-ssa-threadedge.c | 7 ++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c3170c0..75881ee 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2016-12-01 Jeff Law + + * tree-ssa-threadedge.c + (record_temporary_equivalences_from_stmts_at_dest): Avoid temporary + propagation of operands if there are no operands. + 2016-12-02 Jakub Jelinek PR tree-optimization/78586 diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c index 534292c..3fdd59e 100644 --- a/gcc/tree-ssa-threadedge.c +++ b/gcc/tree-ssa-threadedge.c @@ -328,9 +328,10 @@ record_temporary_equivalences_from_stmts_at_dest (edge e, SSA_NAME_VALUE in addition to its own lattice. */ cached_lhs = gimple_fold_stmt_to_constant_1 (stmt, threadedge_valueize); - if (!cached_lhs - || (TREE_CODE (cached_lhs) != SSA_NAME - && !is_gimple_min_invariant (cached_lhs))) + if (NUM_SSA_OPERANDS (stmt, SSA_OP_ALL_USES) != 0 + && (!cached_lhs + || (TREE_CODE (cached_lhs) != SSA_NAME + && !is_gimple_min_invariant (cached_lhs)))) { /* We're going to temporarily copy propagate the operands and see if that allows us to simplify this statement. */ -- cgit v1.1 From 84b0769e335819050ecdd86301a5f5d41fa5df8b Mon Sep 17 00:00:00 2001 From: Maxim Ostapenko Date: Fri, 2 Dec 2016 07:39:27 +0000 Subject: Add support for ASan odr_indicator. config/ * bootstrap-asan.mk: Replace LSAN_OPTIONS=detect_leaks=0 with ASAN_OPTIONS=detect_leaks=0:use_odr_indicator=1. gcc/ * asan.c (asan_global_struct): Refactor. (create_odr_indicator): New function. (asan_needs_odr_indicator_p): Likewise. (is_odr_indicator): Likewise. (asan_add_global): Introduce odr_indicator_ptr. Pass it into global's constructor. (asan_protect_global): Do not protect odr indicators. gcc/c-family/ * c-attribs.c (asan odr indicator): New attribute. (handle_asan_odr_indicator_attribute): New function. gcc/testsuite/ * c-c++-common/asan/no-redundant-odr-indicators-1.c: New test. From-SVN: r243153 --- config/ChangeLog | 5 ++ config/bootstrap-asan.mk | 2 +- gcc/ChangeLog | 10 +++ gcc/asan.c | 87 ++++++++++++++++++++-- gcc/c-family/ChangeLog | 5 ++ gcc/c-family/c-attribs.c | 14 ++++ gcc/testsuite/ChangeLog | 4 + .../asan/no-redundant-odr-indicators-1.c | 17 +++++ 8 files changed, 135 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/asan/no-redundant-odr-indicators-1.c diff --git a/config/ChangeLog b/config/ChangeLog index 8dcb483..a823d21 100644 --- a/config/ChangeLog +++ b/config/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Maxim Ostapenko + + * bootstrap-asan.mk: Replace LSAN_OPTIONS=detect_leaks=0 with + ASAN_OPTIONS=detect_leaks=0:use_odr_indicator=1. + 2016-12-01 Matthias Klose * pkg.m4: Remove. diff --git a/config/bootstrap-asan.mk b/config/bootstrap-asan.mk index 70baaf9..e73d4c2 100644 --- a/config/bootstrap-asan.mk +++ b/config/bootstrap-asan.mk @@ -1,7 +1,7 @@ # This option enables -fsanitize=address for stage2 and stage3. # Suppress LeakSanitizer in bootstrap. -export LSAN_OPTIONS="detect_leaks=0" +export ASAN_OPTIONS=detect_leaks=0:use_odr_indicator=1 STAGE2_CFLAGS += -fsanitize=address STAGE3_CFLAGS += -fsanitize=address diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 75881ee..ef080c7 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2016-12-02 Maxim Ostapenko + + * asan.c (asan_global_struct): Refactor. + (create_odr_indicator): New function. + (asan_needs_odr_indicator_p): Likewise. + (is_odr_indicator): Likewise. + (asan_add_global): Introduce odr_indicator_ptr. Pass it into global's + constructor. + (asan_protect_global): Do not protect odr indicators. + 2016-12-01 Jeff Law * tree-ssa-threadedge.c diff --git a/gcc/asan.c b/gcc/asan.c index cb5d615..5af9547 100644 --- a/gcc/asan.c +++ b/gcc/asan.c @@ -1388,6 +1388,16 @@ asan_needs_local_alias (tree decl) return DECL_WEAK (decl) || !targetm.binds_local_p (decl); } +/* Return true if DECL, a global var, is an artificial ODR indicator symbol + therefore doesn't need protection. */ + +static bool +is_odr_indicator (tree decl) +{ + return (DECL_ARTIFICIAL (decl) + && lookup_attribute ("asan odr indicator", DECL_ATTRIBUTES (decl))); +} + /* Return true if DECL is a VAR_DECL that should be protected by Address Sanitizer, by appending a red zone with protected shadow memory after it and aligning it to at least @@ -1436,7 +1446,8 @@ asan_protect_global (tree decl) || ASAN_RED_ZONE_SIZE * BITS_PER_UNIT > MAX_OFILE_ALIGNMENT || !valid_constant_size_p (DECL_SIZE_UNIT (decl)) || DECL_ALIGN_UNIT (decl) > 2 * ASAN_RED_ZONE_SIZE - || TREE_TYPE (decl) == ubsan_get_source_location_type ()) + || TREE_TYPE (decl) == ubsan_get_source_location_type () + || is_odr_indicator (decl)) return false; rtl = DECL_RTL (decl); @@ -2266,14 +2277,15 @@ asan_dynamic_init_call (bool after_p) static tree asan_global_struct (void) { - static const char *field_names[8] + static const char *field_names[] = { "__beg", "__size", "__size_with_redzone", - "__name", "__module_name", "__has_dynamic_init", "__location", "__odr_indicator"}; - tree fields[8], ret; - int i; + "__name", "__module_name", "__has_dynamic_init", "__location", + "__odr_indicator" }; + tree fields[ARRAY_SIZE (field_names)], ret; + unsigned i; ret = make_node (RECORD_TYPE); - for (i = 0; i < 8; i++) + for (i = 0; i < ARRAY_SIZE (field_names); i++) { fields[i] = build_decl (UNKNOWN_LOCATION, FIELD_DECL, @@ -2295,6 +2307,63 @@ asan_global_struct (void) return ret; } +/* Create and return odr indicator symbol for DECL. + TYPE is __asan_global struct type as returned by asan_global_struct. */ + +static tree +create_odr_indicator (tree decl, tree type) +{ + char *name; + tree uptr = TREE_TYPE (DECL_CHAIN (TYPE_FIELDS (type))); + tree decl_name + = (HAS_DECL_ASSEMBLER_NAME_P (decl) ? DECL_ASSEMBLER_NAME (decl) + : DECL_NAME (decl)); + /* DECL_NAME theoretically might be NULL. Bail out with 0 in this case. */ + if (decl_name == NULL_TREE) + return build_int_cst (uptr, 0); + size_t len = strlen (IDENTIFIER_POINTER (decl_name)) + sizeof ("__odr_asan_"); + name = XALLOCAVEC (char, len); + snprintf (name, len, "__odr_asan_%s", IDENTIFIER_POINTER (decl_name)); +#ifndef NO_DOT_IN_LABEL + name[sizeof ("__odr_asan") - 1] = '.'; +#elif !defined(NO_DOLLAR_IN_LABEL) + name[sizeof ("__odr_asan") - 1] = '$'; +#endif + tree var = build_decl (UNKNOWN_LOCATION, VAR_DECL, get_identifier (name), + char_type_node); + TREE_ADDRESSABLE (var) = 1; + TREE_READONLY (var) = 0; + TREE_THIS_VOLATILE (var) = 1; + DECL_GIMPLE_REG_P (var) = 0; + DECL_ARTIFICIAL (var) = 1; + DECL_IGNORED_P (var) = 1; + TREE_STATIC (var) = 1; + TREE_PUBLIC (var) = 1; + DECL_VISIBILITY (var) = DECL_VISIBILITY (decl); + DECL_VISIBILITY_SPECIFIED (var) = DECL_VISIBILITY_SPECIFIED (decl); + + TREE_USED (var) = 1; + tree ctor = build_constructor_va (TREE_TYPE (var), 1, NULL_TREE, + build_int_cst (unsigned_type_node, 0)); + TREE_CONSTANT (ctor) = 1; + TREE_STATIC (ctor) = 1; + DECL_INITIAL (var) = ctor; + DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("asan odr indicator"), + NULL, DECL_ATTRIBUTES (var)); + make_decl_rtl (var); + varpool_node::finalize_decl (var); + return fold_convert (uptr, build_fold_addr_expr (var)); +} + +/* Return true if DECL, a global var, might be overridden and needs + an additional odr indicator symbol. */ + +static bool +asan_needs_odr_indicator_p (tree decl) +{ + return !DECL_ARTIFICIAL (decl) && !DECL_WEAK (decl) && TREE_PUBLIC (decl); +} + /* Append description of a single global DECL into vector V. TYPE is __asan_global struct type as returned by asan_global_struct. */ @@ -2335,6 +2404,9 @@ asan_add_global (tree decl, tree type, vec *v) assemble_alias (refdecl, DECL_ASSEMBLER_NAME (decl)); } + tree odr_indicator_ptr + = (asan_needs_odr_indicator_p (decl) ? create_odr_indicator (decl, type) + : build_int_cst (uptr, 0)); CONSTRUCTOR_APPEND_ELT (vinner, NULL_TREE, fold_convert (const_ptr_type_node, build_fold_addr_expr (refdecl))); @@ -2382,8 +2454,7 @@ asan_add_global (tree decl, tree type, vec *v) else locptr = build_int_cst (uptr, 0); CONSTRUCTOR_APPEND_ELT (vinner, NULL_TREE, locptr); - /* TODO: support ODR indicators. */ - CONSTRUCTOR_APPEND_ELT (vinner, NULL_TREE, build_int_cst (uptr, 0)); + CONSTRUCTOR_APPEND_ELT (vinner, NULL_TREE, odr_indicator_ptr); init = build_constructor (type, vinner); CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init); } diff --git a/gcc/c-family/ChangeLog b/gcc/c-family/ChangeLog index 183493d..5890798 100644 --- a/gcc/c-family/ChangeLog +++ b/gcc/c-family/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Maxim Ostapenko + + * c-attribs.c (asan odr indicator): New attribute. + (handle_asan_odr_indicator_attribute): New function. + 2016-11-26 Prathamesh Kulkarni * c-common.c (c_common_nodes_and_builtins): Remove initialization of diff --git a/gcc/c-family/c-attribs.c b/gcc/c-family/c-attribs.c index 964efe9..f5adade 100644 --- a/gcc/c-family/c-attribs.c +++ b/gcc/c-family/c-attribs.c @@ -57,6 +57,8 @@ static tree handle_no_address_safety_analysis_attribute (tree *, tree, tree, int, bool *); static tree handle_no_sanitize_undefined_attribute (tree *, tree, tree, int, bool *); +static tree handle_asan_odr_indicator_attribute (tree *, tree, tree, int, + bool *); static tree handle_stack_protect_attribute (tree *, tree, tree, int, bool *); static tree handle_noinline_attribute (tree *, tree, tree, int, bool *); static tree handle_noclone_attribute (tree *, tree, tree, int, bool *); @@ -292,6 +294,9 @@ const struct attribute_spec c_common_attribute_table[] = { "no_sanitize_undefined", 0, 0, true, false, false, handle_no_sanitize_undefined_attribute, false }, + { "asan odr indicator", 0, 0, true, false, false, + handle_asan_odr_indicator_attribute, + false }, { "warning", 1, 1, true, false, false, handle_error_attribute, false }, { "error", 1, 1, true, false, false, @@ -591,6 +596,15 @@ handle_no_sanitize_undefined_attribute (tree *node, tree name, tree, int, return NULL_TREE; } +/* Handle an "asan odr indicator" attribute; arguments as in + struct attribute_spec.handler. */ + +static tree +handle_asan_odr_indicator_attribute (tree *, tree, tree, int, bool *) +{ + return NULL_TREE; +} + /* Handle a "stack_protect" attribute; arguments as in struct attribute_spec.handler. */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index dcbdf56..c000e07 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2016-12-02 Maxim Ostapenko + + * c-c++-common/asan/no-redundant-odr-indicators-1.c: New test. + 2016-12-01 Elizebeth Punnoose PR fortran/77505 diff --git a/gcc/testsuite/c-c++-common/asan/no-redundant-odr-indicators-1.c b/gcc/testsuite/c-c++-common/asan/no-redundant-odr-indicators-1.c new file mode 100644 index 0000000..9231264 --- /dev/null +++ b/gcc/testsuite/c-c++-common/asan/no-redundant-odr-indicators-1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { *-*-* } { "*" } { "-O0" } } */ + +/* Local variables should not have odr indicators. */ +static int a = 2; +/* Thread local variables should not have odr indicators. */ +__thread int b = 3; +/* Externally visible variables should have odr indicators. */ +int c = 1; + +int main () { + return 0; +} + +/* { dg-final { scan-assembler-not "odr_asan\[\.\$\]a" } } */ +/* { dg-final { scan-assembler-not "odr_asan\[\.\$\]b" } } */ +/* { dg-final { scan-assembler "odr_asan\[\.\$\]c" } } */ -- cgit v1.1 From a6a2b532f9c4e92277e390febc8c07f773becb1b Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Fri, 2 Dec 2016 08:21:43 +0000 Subject: S/390: Fix vector all/any cc modes. This fixes a problem with the vector compares producing CC mode results. The instructions produce condition code modes which can be either interpreted to check an ALL elements or an ANY element result. As the modes where used before they could not be inverted by the middle-end by inverting the comparison code (e.g. eq to ne). The result usually was just wrong. In fact inverting a comparison code on an CCVALL mode would require to also change the mode to CCVANY but this cannot be done easily in the middle-end. With this patch the meaning of an ALL cc mode only refers to the not-inverted comparison code (e.g. eq, gt, ge). With that change inverting the comparison code matches a not operation on the condition code mask again. Bootstrapped and regression tested on s390 and s390x. Bye, -Andreas- gcc/testsuite/ChangeLog: 2016-12-02 Andreas Krebbel * gcc.target/s390/vector/vec-scalar-cmp-1.c: Fix and harden the pattern checks. * gcc.target/s390/zvector/vec-cmp-1.c: New test. gcc/ChangeLog: 2016-12-02 Andreas Krebbel * config/s390/s390-modes.def (CCVEQANY, CCVH, CCVHANY, CCVHU) (CCVHUANY): Remove modes. (CCVIH, CCVIHU, CCVIALL, CCVIANY, CCVFALL, CCVFANY): Add modes and documentation. * config/s390/s390.c (s390_match_ccmode_set): Rename cc modes. (s390_expand_vec_compare_scalar): Pick one of the cc consumer modes. (s390_branch_condition_mask): Adjust to use the new cc consumer modes. The new modes allow for proper reversal in the middle-end. (s390_expand_vec_compare_cc): Determine the proper cc producer and consumer modes for a comparison. * config/s390/s390.md: Rename CCVH to CCVIH and CCVHU to CCVIHU throughout the file. * config/s390/vx-builtins.md: Likewise. From-SVN: r243154 --- gcc/ChangeLog | 17 ++ gcc/config/s390/s390-modes.def | 72 ++++--- gcc/config/s390/s390.c | 226 +++++++++++---------- gcc/config/s390/s390.md | 2 +- gcc/config/s390/vx-builtins.md | 44 ++-- gcc/testsuite/ChangeLog | 6 + .../gcc.target/s390/vector/vec-scalar-cmp-1.c | 24 ++- gcc/testsuite/gcc.target/s390/zvector/vec-cmp-1.c | 173 ++++++++++++++++ 8 files changed, 388 insertions(+), 176 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/zvector/vec-cmp-1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index ef080c7..2d55409 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,20 @@ +2016-12-02 Andreas Krebbel + + * config/s390/s390-modes.def (CCVEQANY, CCVH, CCVHANY, CCVHU) + (CCVHUANY): Remove modes. + (CCVIH, CCVIHU, CCVIALL, CCVIANY, CCVFALL, CCVFANY): Add modes and + documentation. + * config/s390/s390.c (s390_match_ccmode_set): Rename cc modes. + (s390_expand_vec_compare_scalar): Pick one of the cc consumer + modes. + (s390_branch_condition_mask): Adjust to use the new cc consumer + modes. The new modes allow for proper reversal in the middle-end. + (s390_expand_vec_compare_cc): Determine the proper cc producer and + consumer modes for a comparison. + * config/s390/s390.md: Rename CCVH to CCVIH and CCVHU to CCVIHU + throughout the file. + * config/s390/vx-builtins.md: Likewise. + 2016-12-02 Maxim Ostapenko * asan.c (asan_global_struct): Refactor. diff --git a/gcc/config/s390/s390-modes.def b/gcc/config/s390/s390-modes.def index 69235b6..15ff903 100644 --- a/gcc/config/s390/s390-modes.def +++ b/gcc/config/s390/s390-modes.def @@ -84,22 +84,6 @@ Requested mode -> Destination CC register mode CCS, CCU, CCT, CCSR, CCUR -> CCZ CCA -> CCAP, CCAN -Vector comparison modes - -CCVEQ EQ - - NE (VCEQ) -CCVEQANY EQ EQ - NE (VCEQ) - -CCVH GT - - LE (VCH) -CCVHANY GT GT - LE (VCH) -CCVHU GTU - - LEU (VCHL) -CCVHUANY GTU GTU - LEU (VCHL) - -CCVFH GT - - UNLE (VFCH) -CCVFHANY GT GT - UNLE (VFCH) -CCVFHE GE - - UNLT (VFCHE) -CCVFHEANY GE GE - UNLT (VFCHE) - - *** Comments *** @@ -169,14 +153,40 @@ The compare and swap instructions sets the condition code to 0/1 if the operands were equal/unequal. The CCZ1 mode ensures the result can be effectively placed into a register. - -CCV* - -The variants with and without ANY are generated by the same -instructions and therefore are holding the same information. However, -when generating a condition code mask they require checking different -bits of CC. In that case the variants without ANY represent the -results for *all* elements. +CCVIH, CCVIHU, CCVFH, CCVFHE + +These are condition code modes used in instructions setting the +condition code. The mode determines which comparison to perform (H - +high, HU - high unsigned, HE - high or equal) and whether it is a +floating point comparison or not (I - int, F - float). + +The comparison operation to be performed needs to be encoded into the +condition code mode since the comparison operator is not available in +compare style patterns (set cc (compare (op0) (op1))). So the +condition code mode is the only information to determine the +instruction to be used. + +CCVIALL, CCVIANY, CCVFALL, CCVFANY + +These modes are used in instructions reading the condition code. +Opposed to the CC producer patterns the comparison operator is +available. Hence the comparison operation does not need to be part of +the CC mode. However, we still need to know whether CC has been +generated by a float or an integer comparison in order to be able to +invert the condition correctly (int: GT -> LE, float: GT -> UNLE). + +The ALL and ANY variants differ only in the usage of CC1 which +indicates a mixed result across the vector elements. Be aware that +depending on the comparison code the ALL and ANY variants might +actually refer to their opposite meaning. I.e. while inverting the +comparison in (EQ (reg:CCVIALL 33) (const_int 0)) results in (NE +(reg:CCVIALL 33) (const_int 0)) it in fact describes an ANY comparison +(inverting "all equal" should be "any not equal") However, the +middle-end does invert only the comparison operator without touching +the mode. +Hence, the ALL/ANY in the mode names refer to the meaning in the +context of EQ, GT, GE while for the inverted codes it actually means +ANY/ALL. CCRAW @@ -209,18 +219,18 @@ CC_MODE (CCT3); CC_MODE (CCRAW); CC_MODE (CCVEQ); -CC_MODE (CCVEQANY); -CC_MODE (CCVH); -CC_MODE (CCVHANY); -CC_MODE (CCVHU); -CC_MODE (CCVHUANY); +CC_MODE (CCVIH); +CC_MODE (CCVIHU); CC_MODE (CCVFH); -CC_MODE (CCVFHANY); CC_MODE (CCVFHE); -CC_MODE (CCVFHEANY); +CC_MODE (CCVIALL); +CC_MODE (CCVIANY); + +CC_MODE (CCVFALL); +CC_MODE (CCVFANY); /* Vector modes. */ diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index dc82fb6..445c147 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -1275,6 +1275,11 @@ s390_match_ccmode_set (rtx set, machine_mode req_mode) gcc_assert (GET_CODE (set) == SET); + /* These modes are supposed to be used only in CC consumer + patterns. */ + gcc_assert (req_mode != CCVIALLmode && req_mode != CCVIANYmode + && req_mode != CCVFALLmode && req_mode != CCVFANYmode); + if (GET_CODE (SET_DEST (set)) != REG || !CC_REGNO_P (REGNO (SET_DEST (set)))) return 1; @@ -1293,8 +1298,8 @@ s390_match_ccmode_set (rtx set, machine_mode req_mode) case CCT2mode: case CCT3mode: case CCVEQmode: - case CCVHmode: - case CCVHUmode: + case CCVIHmode: + case CCVIHUmode: case CCVFHmode: case CCVFHEmode: if (req_mode != set_mode) @@ -1752,14 +1757,20 @@ s390_expand_vec_compare_scalar (enum rtx_code *code, rtx cmp1, rtx cmp2, cmp2 = cmp1; cmp1 = tmp; } - *cc = gen_rtx_REG (cmp_mode, CC_REGNUM); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, - gen_rtx_SET (*cc, + gen_rtx_SET (gen_rtx_REG (cmp_mode, CC_REGNUM), gen_rtx_COMPARE (cmp_mode, cmp1, cmp2)), gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (V2DImode))))); + + /* This is the cc reg how it will be used in the cc mode consumer. + It either needs to be CCVFALL or CCVFANY. However, CC1 will + never be set by the scalar variants. So it actually doesn't + matter which one we choose here. */ + *cc = gen_rtx_REG (CCVFALLmode, CC_REGNUM); return true; } @@ -2021,92 +2032,63 @@ s390_branch_condition_mask (rtx code) break; /* Vector comparison modes. */ - - case CCVEQmode: - switch (GET_CODE (code)) - { - case EQ: return CC0; - case NE: return CC3; - default: return -1; - } - - case CCVEQANYmode: - switch (GET_CODE (code)) - { - case EQ: return CC0 | CC1; - case NE: return CC3 | CC1; - default: return -1; - } - - /* Integer vector compare modes. */ - - case CCVHmode: - switch (GET_CODE (code)) - { - case GT: return CC0; - case LE: return CC3; - default: return -1; - } - - case CCVHANYmode: - switch (GET_CODE (code)) - { - case GT: return CC0 | CC1; - case LE: return CC3 | CC1; - default: return -1; - } - - case CCVHUmode: - switch (GET_CODE (code)) - { - case GTU: return CC0; - case LEU: return CC3; - default: return -1; - } - - case CCVHUANYmode: - switch (GET_CODE (code)) - { - case GTU: return CC0 | CC1; - case LEU: return CC3 | CC1; - default: return -1; - } - - /* FP vector compare modes. */ - - case CCVFHmode: + /* CC2 will never be set. It however is part of the negated + masks. */ + case CCVIALLmode: switch (GET_CODE (code)) { - case GT: return CC0; - case UNLE: return CC3; + case EQ: + case GTU: + case GT: + case GE: return CC0; + /* The inverted modes are in fact *any* modes. */ + case NE: + case LEU: + case LE: + case LT: return CC3 | CC1 | CC2; default: return -1; } - case CCVFHANYmode: + case CCVIANYmode: switch (GET_CODE (code)) { - case GT: return CC0 | CC1; - case UNLE: return CC3 | CC1; + case EQ: + case GTU: + case GT: + case GE: return CC0 | CC1; + /* The inverted modes are in fact *all* modes. */ + case NE: + case LEU: + case LE: + case LT: return CC3 | CC2; default: return -1; } - - case CCVFHEmode: + case CCVFALLmode: switch (GET_CODE (code)) { + case EQ: + case GT: case GE: return CC0; - case UNLT: return CC3; + /* The inverted modes are in fact *any* modes. */ + case NE: + case UNLE: + case UNLT: return CC3 | CC1 | CC2; default: return -1; } - case CCVFHEANYmode: + case CCVFANYmode: switch (GET_CODE (code)) { + case EQ: + case GT: case GE: return CC0 | CC1; - case UNLT: return CC3 | CC1; + /* The inverted modes are in fact *all* modes. */ + case NE: + case UNLE: + case UNLT: return CC3 | CC2; default: return -1; } - case CCRAWmode: switch (GET_CODE (code)) { @@ -6256,13 +6238,15 @@ s390_expand_vec_compare (rtx target, enum rtx_code cond, /* Expand the comparison CODE of CMP1 and CMP2 and copy 1 or 0 into TARGET if either all (ALL_P is true) or any (ALL_P is false) of the - elements in CMP1 and CMP2 fulfill the comparison. */ + elements in CMP1 and CMP2 fulfill the comparison. + This function is only used to emit patterns for the vx builtins and + therefore only handles comparison codes required by the + builtins. */ void s390_expand_vec_compare_cc (rtx target, enum rtx_code code, rtx cmp1, rtx cmp2, bool all_p) { - enum rtx_code new_code = code; - machine_mode cmp_mode, full_cmp_mode, scratch_mode; + machine_mode cc_producer_mode, cc_consumer_mode, scratch_mode; rtx tmp_reg = gen_reg_rtx (SImode); bool swap_p = false; @@ -6270,53 +6254,71 @@ s390_expand_vec_compare_cc (rtx target, enum rtx_code code, { switch (code) { - case EQ: cmp_mode = CCVEQmode; break; - case NE: cmp_mode = CCVEQmode; break; - case GT: cmp_mode = CCVHmode; break; - case GE: cmp_mode = CCVHmode; new_code = LE; swap_p = true; break; - case LT: cmp_mode = CCVHmode; new_code = GT; swap_p = true; break; - case LE: cmp_mode = CCVHmode; new_code = LE; break; - case GTU: cmp_mode = CCVHUmode; break; - case GEU: cmp_mode = CCVHUmode; new_code = LEU; swap_p = true; break; - case LTU: cmp_mode = CCVHUmode; new_code = GTU; swap_p = true; break; - case LEU: cmp_mode = CCVHUmode; new_code = LEU; break; - default: gcc_unreachable (); + case EQ: + case NE: + cc_producer_mode = CCVEQmode; + break; + case GE: + case LT: + code = swap_condition (code); + swap_p = true; + /* fallthrough */ + case GT: + case LE: + cc_producer_mode = CCVIHmode; + break; + case GEU: + case LTU: + code = swap_condition (code); + swap_p = true; + /* fallthrough */ + case GTU: + case LEU: + cc_producer_mode = CCVIHUmode; + break; + default: + gcc_unreachable (); } + scratch_mode = GET_MODE (cmp1); + /* These codes represent inverted CC interpretations. Inverting + an ALL CC mode results in an ANY CC mode and the other way + around. Invert the all_p flag here to compensate for + that. */ + if (code == NE || code == LE || code == LEU) + all_p = !all_p; + + cc_consumer_mode = all_p ? CCVIALLmode : CCVIANYmode; } - else if (GET_MODE (cmp1) == V2DFmode) + else if (GET_MODE_CLASS (GET_MODE (cmp1)) == MODE_VECTOR_FLOAT) { + bool inv_p = false; + switch (code) { - case EQ: cmp_mode = CCVEQmode; break; - case NE: cmp_mode = CCVEQmode; break; - case GT: cmp_mode = CCVFHmode; break; - case GE: cmp_mode = CCVFHEmode; break; - case UNLE: cmp_mode = CCVFHmode; break; - case UNLT: cmp_mode = CCVFHEmode; break; - case LT: cmp_mode = CCVFHmode; new_code = GT; swap_p = true; break; - case LE: cmp_mode = CCVFHEmode; new_code = GE; swap_p = true; break; + case EQ: cc_producer_mode = CCVEQmode; break; + case NE: cc_producer_mode = CCVEQmode; inv_p = true; break; + case GT: cc_producer_mode = CCVFHmode; break; + case GE: cc_producer_mode = CCVFHEmode; break; + case UNLE: cc_producer_mode = CCVFHmode; inv_p = true; break; + case UNLT: cc_producer_mode = CCVFHEmode; inv_p = true; break; + case LT: cc_producer_mode = CCVFHmode; code = GT; swap_p = true; break; + case LE: cc_producer_mode = CCVFHEmode; code = GE; swap_p = true; break; default: gcc_unreachable (); } - scratch_mode = V2DImode; + scratch_mode = mode_for_vector ( + int_mode_for_mode (GET_MODE_INNER (GET_MODE (cmp1))), + GET_MODE_NUNITS (GET_MODE (cmp1))); + gcc_assert (scratch_mode != BLKmode); + + if (inv_p) + all_p = !all_p; + + cc_consumer_mode = all_p ? CCVFALLmode : CCVFANYmode; } else gcc_unreachable (); - if (!all_p) - switch (cmp_mode) - { - case CCVEQmode: full_cmp_mode = CCVEQANYmode; break; - case CCVHmode: full_cmp_mode = CCVHANYmode; break; - case CCVHUmode: full_cmp_mode = CCVHUANYmode; break; - case CCVFHmode: full_cmp_mode = CCVFHANYmode; break; - case CCVFHEmode: full_cmp_mode = CCVFHEANYmode; break; - default: gcc_unreachable (); - } - else - /* The modes without ANY match the ALL modes. */ - full_cmp_mode = cmp_mode; - if (swap_p) { rtx tmp = cmp2; @@ -6326,8 +6328,8 @@ s390_expand_vec_compare_cc (rtx target, enum rtx_code code, emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, gen_rtx_SET ( - gen_rtx_REG (cmp_mode, CC_REGNUM), - gen_rtx_COMPARE (cmp_mode, cmp1, cmp2)), + gen_rtx_REG (cc_producer_mode, CC_REGNUM), + gen_rtx_COMPARE (cc_producer_mode, cmp1, cmp2)), gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (scratch_mode))))); emit_move_insn (target, const0_rtx); @@ -6335,10 +6337,10 @@ s390_expand_vec_compare_cc (rtx target, enum rtx_code code, emit_move_insn (target, gen_rtx_IF_THEN_ELSE (SImode, - gen_rtx_fmt_ee (new_code, VOIDmode, - gen_rtx_REG (full_cmp_mode, CC_REGNUM), + gen_rtx_fmt_ee (code, VOIDmode, + gen_rtx_REG (cc_consumer_mode, CC_REGNUM), const0_rtx), - target, tmp_reg)); + tmp_reg, target)); } /* Generate a vector comparison expression loading either elements of diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index a449b03..5844e28 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -782,7 +782,7 @@ ; Used with VFCMP to expand part of the mnemonic ; For fp we have a mismatch: eq in the insn name - e in asm (define_mode_attr asm_fcmp [(CCVEQ "e") (CCVFH "h") (CCVFHE "he")]) -(define_mode_attr insn_cmp [(CCVEQ "eq") (CCVH "h") (CCVHU "hl") (CCVFH "h") (CCVFHE "he")]) +(define_mode_attr insn_cmp [(CCVEQ "eq") (CCVIH "h") (CCVIHU "hl") (CCVFH "h") (CCVFHE "he")]) ;; Subst pattern definitions (include "subst.md") diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index c6ac44c..51d022c 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -36,7 +36,7 @@ (V1DF "DI") (V2DF "DI")]) ; Condition code modes generated by int comparisons -(define_mode_iterator VICMP [CCVEQ CCVH CCVHU]) +(define_mode_iterator VICMP [CCVEQ CCVIH CCVIHU]) ; Comparisons supported by the vec_cmp* builtins (define_code_iterator intcmp [eq gt gtu ge geu lt ltu le leu]) @@ -1900,24 +1900,24 @@ (define_expand "vec_cmph_cc" [(parallel - [(set (reg:CCVH CC_REGNUM) - (compare:CCVH (match_operand:VI_HW 1 "register_operand" "v") - (match_operand:VI_HW 2 "register_operand" "v"))) + [(set (reg:CCVIH CC_REGNUM) + (compare:CCVIH (match_operand:VI_HW 1 "register_operand" "v") + (match_operand:VI_HW 2 "register_operand" "v"))) (set (match_operand:VI_HW 0 "register_operand" "=v") (gt:VI_HW (match_dup 1) (match_dup 2)))]) (set (match_operand:SI 3 "memory_operand" "") - (unspec:SI [(reg:CCVH CC_REGNUM)] UNSPEC_CC_TO_INT))] + (unspec:SI [(reg:CCVIH CC_REGNUM)] UNSPEC_CC_TO_INT))] "TARGET_VX") (define_expand "vec_cmphl_cc" [(parallel - [(set (reg:CCVHU CC_REGNUM) - (compare:CCVHU (match_operand:VI_HW 1 "register_operand" "v") - (match_operand:VI_HW 2 "register_operand" "v"))) + [(set (reg:CCVIHU CC_REGNUM) + (compare:CCVIHU (match_operand:VI_HW 1 "register_operand" "v") + (match_operand:VI_HW 2 "register_operand" "v"))) (set (match_operand:VI_HW 0 "register_operand" "=v") (gtu:VI_HW (match_dup 1) (match_dup 2)))]) (set (match_operand:SI 3 "memory_operand" "") - (unspec:SI [(reg:CCVHU CC_REGNUM)] UNSPEC_CC_TO_INT))] + (unspec:SI [(reg:CCVIHU CC_REGNUM)] UNSPEC_CC_TO_INT))] "TARGET_VX") @@ -1932,9 +1932,9 @@ [(set_attr "op_type" "VRR")]) (define_insn "*vec_cmph_cc" - [(set (reg:CCVH CC_REGNUM) - (compare:CCVH (match_operand:VI_HW 0 "register_operand" "v") - (match_operand:VI_HW 1 "register_operand" "v"))) + [(set (reg:CCVIH CC_REGNUM) + (compare:CCVIH (match_operand:VI_HW 0 "register_operand" "v") + (match_operand:VI_HW 1 "register_operand" "v"))) (set (match_operand:VI_HW 2 "register_operand" "=v") (gt:VI_HW (match_dup 0) (match_dup 1)))] "TARGET_VX" @@ -1942,9 +1942,9 @@ [(set_attr "op_type" "VRR")]) (define_insn "*vec_cmphl_cc" - [(set (reg:CCVHU CC_REGNUM) - (compare:CCVHU (match_operand:VI_HW 0 "register_operand" "v") - (match_operand:VI_HW 1 "register_operand" "v"))) + [(set (reg:CCVIHU CC_REGNUM) + (compare:CCVIHU (match_operand:VI_HW 0 "register_operand" "v") + (match_operand:VI_HW 1 "register_operand" "v"))) (set (match_operand:VI_HW 2 "register_operand" "=v") (gtu:VI_HW (match_dup 0) (match_dup 1)))] "TARGET_VX" @@ -1978,13 +1978,13 @@ (define_expand "vec_cmphv2df_cc" [(parallel - [(set (reg:CCVH CC_REGNUM) - (compare:CCVH (match_operand:V2DF 1 "register_operand" "v") - (match_operand:V2DF 2 "register_operand" "v"))) + [(set (reg:CCVIH CC_REGNUM) + (compare:CCVIH (match_operand:V2DF 1 "register_operand" "v") + (match_operand:V2DF 2 "register_operand" "v"))) (set (match_operand:V2DI 0 "register_operand" "=v") (gt:V2DI (match_dup 1) (match_dup 2)))]) (set (match_operand:SI 3 "memory_operand" "") - (unspec:SI [(reg:CCVH CC_REGNUM)] UNSPEC_CC_TO_INT))] + (unspec:SI [(reg:CCVIH CC_REGNUM)] UNSPEC_CC_TO_INT))] "TARGET_VX") (define_expand "vec_cmphev2df_cc" @@ -2010,9 +2010,9 @@ [(set_attr "op_type" "VRR")]) (define_insn "*vec_cmphv2df_cc" - [(set (reg:CCVH CC_REGNUM) - (compare:CCVH (match_operand:V2DF 0 "register_operand" "v") - (match_operand:V2DF 1 "register_operand" "v"))) + [(set (reg:CCVIH CC_REGNUM) + (compare:CCVIH (match_operand:V2DF 0 "register_operand" "v") + (match_operand:V2DF 1 "register_operand" "v"))) (set (match_operand:V2DI 2 "register_operand" "=v") (gt:V2DI (match_dup 0) (match_dup 1)))] "TARGET_VX" diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index c000e07..2ecf8f9 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2016-12-02 Andreas Krebbel + + * gcc.target/s390/vector/vec-scalar-cmp-1.c: Fix and harden the + pattern checks. + * gcc.target/s390/zvector/vec-cmp-1.c: New test. + 2016-12-02 Maxim Ostapenko * c-c++-common/asan/no-redundant-odr-indicators-1.c: New test. diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-scalar-cmp-1.c b/gcc/testsuite/gcc.target/s390/vector/vec-scalar-cmp-1.c index 5f63eda..46a261f 100644 --- a/gcc/testsuite/gcc.target/s390/vector/vec-scalar-cmp-1.c +++ b/gcc/testsuite/gcc.target/s390/vector/vec-scalar-cmp-1.c @@ -1,16 +1,7 @@ /* Check that we use the scalar variants of vector compares. */ /* { dg-do compile { target { s390*-*-* } } } */ -/* { dg-options "-O3 -mzarch -march=z13" } */ - -/* { dg-final { scan-assembler-times "wfcedbs\t%v\[0-9\]*,%v0,%v2" 2 } } */ -/* { dg-final { scan-assembler-times "wfchdbs\t%v\[0-9\]*,%v0,%v2" 1 } } */ -/* { dg-final { scan-assembler-times "wfchedbs\t%v\[0-9\]*,%v2,%v0" 1 } } */ -/* { dg-final { scan-assembler-times "wfchdbs\t%v\[0-9\]*,%v2,%v0" 1 } } */ -/* { dg-final { scan-assembler-times "wfchedbs\t%v\[0-9\]*,%v2,%v0" 1 } } */ -/* { dg-final { scan-assembler-times "lochine" 5 } } */ -/* { dg-final { scan-assembler-times "lochino" 1 } } */ - +/* { dg-options "-O3 -mzarch -march=z13 -fno-asynchronous-unwind-tables" } */ int eq (double a, double b) @@ -18,32 +9,45 @@ eq (double a, double b) return a == b; } +/* { dg-final { scan-assembler "eq:\n\twfcedbs\t%v\[0-9\]*,%v0,%v2\n\tlhi\t%r2,1\n\tlochine\t%r2,0" } } */ + int ne (double a, double b) { return a != b; } +/* { dg-final { scan-assembler "ne:\n\twfcedbs\t%v\[0-9\]*,%v0,%v2\n\tlhi\t%r2,1\n\tlochie\t%r2,0" } } */ + int gt (double a, double b) { return a > b; } +/* { dg-final { scan-assembler "gt:\n\twfchdbs\t%v\[0-9\]*,%v0,%v2\n\tlhi\t%r2,1\n\tlochine\t%r2,0" } } */ + int ge (double a, double b) { return a >= b; } +/* { dg-final { scan-assembler "ge:\n\twfchedbs\t%v\[0-9\]*,%v0,%v2\n\tlhi\t%r2,1\n\tlochine\t%r2,0" } } */ + int lt (double a, double b) { return a < b; } +/* { dg-final { scan-assembler "lt:\n\twfchdbs\t%v\[0-9\]*,%v2,%v0\n\tlhi\t%r2,1\n\tlochine\t%r2,0" } } */ + int le (double a, double b) { return a <= b; } + +/* { dg-final { scan-assembler "le:\n\twfchedbs\t%v\[0-9\]*,%v2,%v0\n\tlhi\t%r2,1\n\tlochine\t%r2,0" } } */ + diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec-cmp-1.c b/gcc/testsuite/gcc.target/s390/zvector/vec-cmp-1.c new file mode 100644 index 0000000..58bc39f --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/zvector/vec-cmp-1.c @@ -0,0 +1,173 @@ +/* { dg-do compile { target { s390*-*-* } } } */ +/* { dg-options "-O3 -mzarch -march=z13 -mzvector -fno-asynchronous-unwind-tables" } */ + +#include + +int __attribute__((noinline,noclone)) +all_eq_double (vector double a, vector double b) +{ + return vec_all_eq (a, b); +} +/* { dg-final { scan-assembler-times all_eq_double:\n\tvfcedbs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochine\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_ne_double (vector double a, vector double b) +{ + return vec_all_ne (a, b); +} +/* { dg-final { scan-assembler-times all_ne_double:\n\tvfcedbs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochile\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_gt_double (vector double a, vector double b) +{ + return vec_all_gt (a, b); +} +/* { dg-final { scan-assembler-times all_gt_double:\n\tvfchdbs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochine\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_lt_double (vector double a, vector double b) +{ + return vec_all_lt (a, b); +} +/* { dg-final { scan-assembler-times all_lt_double:\n\tvfchdbs\t%v\[0-9\]*,%v26,%v24\n\tlhi\t%r2,1\n\tlochine\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_ge_double (vector double a, vector double b) +{ + return vec_all_ge (a, b); +} +/* { dg-final { scan-assembler-times all_ge_double:\n\tvfchedbs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochine\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_le_double (vector double a, vector double b) +{ + return vec_all_le (a, b); +} +/* { dg-final { scan-assembler-times all_le_double:\n\tvfchedbs\t%v\[0-9\]*,%v26,%v24\n\tlhi\t%r2,1\n\tlochine\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_eq_double (vector double a, vector double b) +{ + return vec_any_eq (a, b); +} +/* { dg-final { scan-assembler-times any_eq_double:\n\tvfcedbs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochinle\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_ne_double (vector double a, vector double b) +{ + return vec_any_ne (a, b); +} +/* { dg-final { scan-assembler-times any_ne_double:\n\tvfcedbs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochie\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_gt_double (vector double a, vector double b) +{ + return vec_any_gt (a, b); +} +/* { dg-final { scan-assembler-times any_gt_double:\n\tvfchdbs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochinle\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_lt_double (vector double a, vector double b) +{ + return vec_any_lt (a, b); +} +/* { dg-final { scan-assembler-times any_lt_double:\n\tvfchdbs\t%v\[0-9\]*,%v26,%v24\n\tlhi\t%r2,1\n\tlochinle\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_ge_double (vector double a, vector double b) +{ + return vec_any_ge (a, b); +} +/* { dg-final { scan-assembler-times any_ge_double:\n\tvfchedbs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochinle\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_le_double (vector double a, vector double b) +{ + return vec_any_le (a, b); +} +/* { dg-final { scan-assembler-times any_le_double:\n\tvfchedbs\t%v\[0-9\]*,%v26,%v24\n\tlhi\t%r2,1\n\tlochinle\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_eq_int (vector int a, vector int b) +{ + return vec_all_eq (a, b); +} +/* { dg-final { scan-assembler-times all_eq_int:\n\tvceqfs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochine\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_ne_int (vector int a, vector int b) +{ + return vec_all_ne (a, b); +} +/* { dg-final { scan-assembler-times all_ne_int:\n\tvceqfs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochile\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_gt_int (vector int a, vector int b) +{ + return vec_all_gt (a, b); +} +/* { dg-final { scan-assembler-times all_gt_int:\n\tvchfs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochine\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_lt_int (vector int a, vector int b) +{ + return vec_all_lt (a, b); +} +/* { dg-final { scan-assembler-times all_lt_int:\n\tvchfs\t%v\[0-9\]*,%v26,%v24\n\tlhi\t%r2,1\n\tlochine\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_ge_int (vector int a, vector int b) +{ + return vec_all_ge (a, b); +} +/* { dg-final { scan-assembler-times all_ge_int:\n\tvchfs\t%v\[0-9\]*,%v26,%v24\n\tlhi\t%r2,1\n\tlochile\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +all_le_int (vector int a, vector int b) +{ + return vec_all_le (a, b); +} +/* { dg-final { scan-assembler-times all_le_int:\n\tvchfs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochile\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_eq_int (vector int a, vector int b) +{ + return vec_any_eq (a, b); +} +/* { dg-final { scan-assembler-times any_eq_int:\n\tvceqfs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochinle\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_ne_int (vector int a, vector int b) +{ + return vec_any_ne (a, b); +} +/* { dg-final { scan-assembler-times any_ne_int:\n\tvceqfs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochie\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_gt_int (vector int a, vector int b) +{ + return vec_any_gt (a, b); +} +/* { dg-final { scan-assembler-times any_gt_int:\n\tvchfs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochinle\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_lt_int (vector int a, vector int b) +{ + return vec_any_lt (a, b); +} +/* { dg-final { scan-assembler-times any_lt_int:\n\tvchfs\t%v\[0-9\]*,%v26,%v24\n\tlhi\t%r2,1\n\tlochinle\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_ge_int (vector int a, vector int b) +{ + return vec_any_ge (a, b); +} +/* { dg-final { scan-assembler-times any_ge_int:\n\tvchfs\t%v\[0-9\]*,%v26,%v24\n\tlhi\t%r2,1\n\tlochie\t%r2,0 1 } } */ + +int __attribute__((noinline,noclone)) +any_le_int (vector int a, vector int b) +{ + return vec_any_le (a, b); +} +/* { dg-final { scan-assembler-times any_le_int:\n\tvchfs\t%v\[0-9\]*,%v24,%v26\n\tlhi\t%r2,1\n\tlochie\t%r2,0 1 } } */ + -- cgit v1.1 From eca9803844ddf459d3f5992aa88353603f0cb731 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Fri, 2 Dec 2016 08:22:34 +0000 Subject: S/390: Merge compare of compare results With this patch EQ and NE compares on CC mode reader patterns are folded. This allows using the result of the vec_all_* and vec_any_* builtins directly in a conditional jump instruction as in the attached testcase. gcc/ChangeLog: 2016-12-02 Andreas Krebbel * config/s390/s390-protos.h (s390_reverse_condition): New prototype. * config/s390/s390.c (s390_canonicalize_comparison): Fold compares of CC mode values. (s390_reverse_condition): New function. * config/s390/s390.h (REVERSE_CC_MODE, REVERSE_CONDITION): Define target macros. gcc/testsuite/ChangeLog: 2016-12-02 Andreas Krebbel * gcc.target/s390/zvector/vec-cmp-2.c: New test. From-SVN: r243155 --- gcc/ChangeLog | 10 ++++++++++ gcc/config/s390/s390-protos.h | 1 + gcc/config/s390/s390.c | 42 ++++++++++++++++++++++++++++++++++++++++++ gcc/config/s390/s390.h | 12 ++++++++++++ gcc/testsuite/ChangeLog | 4 ++++ 5 files changed, 69 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 2d55409..d06661e 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,15 @@ 2016-12-02 Andreas Krebbel + * config/s390/s390-protos.h (s390_reverse_condition): New + prototype. + * config/s390/s390.c (s390_canonicalize_comparison): Fold compares + of CC mode values. + (s390_reverse_condition): New function. + * config/s390/s390.h (REVERSE_CC_MODE, REVERSE_CONDITION): Define + target macros. + +2016-12-02 Andreas Krebbel + * config/s390/s390-modes.def (CCVEQANY, CCVH, CCVHANY, CCVHU) (CCVHUANY): Remove modes. (CCVIH, CCVIHU, CCVIALL, CCVIANY, CCVFALL, CCVFANY): Add modes and diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index 7ae98d4..000a677 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -119,6 +119,7 @@ extern void s390_expand_atomic (machine_mode, enum rtx_code, extern void s390_expand_tbegin (rtx, rtx, rtx, bool); extern void s390_expand_vec_compare (rtx, enum rtx_code, rtx, rtx); extern void s390_expand_vec_compare_cc (rtx, enum rtx_code, rtx, rtx, bool); +extern enum rtx_code s390_reverse_condition (machine_mode, enum rtx_code); extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx); extern void s390_expand_vec_init (rtx, rtx); extern rtx s390_return_addr_rtx (int, rtx); diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 445c147..dab4f43 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -1722,6 +1722,31 @@ s390_canonicalize_comparison (int *code, rtx *op0, rtx *op1, } tmp = *op0; *op0 = *op1; *op1 = tmp; } + + /* A comparison result is compared against zero. Replace it with + the (perhaps inverted) original comparison. + This probably should be done by simplify_relational_operation. */ + if ((*code == EQ || *code == NE) + && *op1 == const0_rtx + && COMPARISON_P (*op0) + && CC_REG_P (XEXP (*op0, 0))) + { + enum rtx_code new_code; + + if (*code == EQ) + new_code = reversed_comparison_code_parts (GET_CODE (*op0), + XEXP (*op0, 0), + XEXP (*op1, 0), NULL); + else + new_code = GET_CODE (*op0); + + if (new_code != UNKNOWN) + { + *code = new_code; + *op1 = XEXP (*op0, 1); + *op0 = XEXP (*op0, 0); + } + } } /* Helper function for s390_emit_compare. If possible emit a 64 bit @@ -6343,6 +6368,23 @@ s390_expand_vec_compare_cc (rtx target, enum rtx_code code, tmp_reg, target)); } +/* Invert the comparison CODE applied to a CC mode. This is only safe + if we know whether there result was created by a floating point + compare or not. For the CCV modes this is encoded as part of the + mode. */ +enum rtx_code +s390_reverse_condition (machine_mode mode, enum rtx_code code) +{ + /* Reversal of FP compares takes care -- an ordered compare + becomes an unordered compare and vice versa. */ + if (mode == CCVFALLmode || mode == CCVFANYmode) + return reverse_condition_maybe_unordered (code); + else if (mode == CCVIALLmode || mode == CCVIANYmode) + return reverse_condition (code); + else + gcc_unreachable (); +} + /* Generate a vector comparison expression loading either elements of THEN or ELS into TARGET depending on the comparison COND of CMP_OP1 and CMP_OP2. */ diff --git a/gcc/config/s390/s390.h b/gcc/config/s390/s390.h index 6be4d34..1d6d7b2 100644 --- a/gcc/config/s390/s390.h +++ b/gcc/config/s390/s390.h @@ -513,6 +513,18 @@ extern const char *s390_host_detect_local_cpu (int argc, const char **argv); #define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS) \ s390_cannot_change_mode_class ((FROM), (TO), (CLASS)) +/* We can reverse a CC mode safely if we know whether it comes from a + floating point compare or not. With the vector modes it is encoded + as part of the mode. + FIXME: It might make sense to do this for other cc modes as well. */ +#define REVERSIBLE_CC_MODE(MODE) \ + ((MODE) == CCVIALLmode || (MODE) == CCVIANYmode \ + || (MODE) == CCVFALLmode || (MODE) == CCVFANYmode) + +/* Given a condition code and a mode, return the inverse condition. */ +#define REVERSE_CONDITION(CODE, MODE) s390_reverse_condition (MODE, CODE) + + /* Register classes. */ /* We use the following register classes: diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 2ecf8f9..dc269ef 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,9 @@ 2016-12-02 Andreas Krebbel + * gcc.target/s390/zvector/vec-cmp-2.c: New test. + +2016-12-02 Andreas Krebbel + * gcc.target/s390/vector/vec-scalar-cmp-1.c: Fix and harden the pattern checks. * gcc.target/s390/zvector/vec-cmp-1.c: New test. -- cgit v1.1 From f00bc26c002e81a23e84c8c359a97c88b6b95447 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Fri, 2 Dec 2016 08:23:19 +0000 Subject: Add testcase missing in last commit. gcc/testsuite/ChangeLog: 2016-12-02 Andreas Krebbel * gcc.target/s390/zvector/vec-cmp-2.c: New test. From-SVN: r243156 --- gcc/testsuite/gcc.target/s390/zvector/vec-cmp-2.c | 203 ++++++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 gcc/testsuite/gcc.target/s390/zvector/vec-cmp-2.c diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec-cmp-2.c b/gcc/testsuite/gcc.target/s390/zvector/vec-cmp-2.c new file mode 100644 index 0000000..0711f9c --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/zvector/vec-cmp-2.c @@ -0,0 +1,203 @@ +/* Similiar to vec-cmp-1.c but requires that + s390_canonicalize_comparison is able to merge the the two nested + compares. */ + +/* { dg-do compile { target { s390*-*-* } } } */ +/* { dg-options "-O3 -mzarch -march=z13 -mzvector -fno-asynchronous-unwind-tables" } */ + +#include + +extern void foo (void); + +int __attribute__((noinline,noclone)) +all_eq_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_all_eq (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_eq_double:\n\tvfcedbs\t%v\[0-9\]*,%v24,%v26\n\tjne 1 } } */ + +int __attribute__((noinline,noclone)) +all_ne_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_all_ne (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_ne_double:\n\tvfcedbs\t%v\[0-9\]*,%v24,%v26\n\tjle 1 } } */ + +int __attribute__((noinline,noclone)) +all_gt_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_all_gt (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_gt_double:\n\tvfchdbs\t%v\[0-9\]*,%v24,%v26\n\tjne 1 } } */ + +int __attribute__((noinline,noclone)) +all_lt_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_all_lt (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_lt_double:\n\tvfchdbs\t%v\[0-9\]*,%v26,%v24\n\tjne 1 } } */ + +int __attribute__((noinline,noclone)) +all_ge_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_all_ge (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_ge_double:\n\tvfchedbs\t%v\[0-9\]*,%v24,%v26\n\tjne 1 } } */ + +int __attribute__((noinline,noclone)) +all_le_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_all_le (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_le_double:\n\tvfchedbs\t%v\[0-9\]*,%v26,%v24\n\tjne 1 } } */ + +int __attribute__((noinline,noclone)) +any_eq_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_any_eq (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_eq_double:\n\tvfcedbs\t%v\[0-9\]*,%v24,%v26\n\tjnle 1 } } */ + +int __attribute__((noinline,noclone)) +any_ne_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_any_ne (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_ne_double:\n\tvfcedbs\t%v\[0-9\]*,%v24,%v26\n\tje 1 } } */ + +int __attribute__((noinline,noclone)) +any_gt_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_any_gt (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_gt_double:\n\tvfchdbs\t%v\[0-9\]*,%v24,%v26\n\tjnle 1 } } */ + +int __attribute__((noinline,noclone)) +any_lt_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_any_lt (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_lt_double:\n\tvfchdbs\t%v\[0-9\]*,%v26,%v24\n\tjnle 1 } } */ + +int __attribute__((noinline,noclone)) +any_ge_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_any_ge (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_ge_double:\n\tvfchedbs\t%v\[0-9\]*,%v24,%v26\n\tjnle 1 } } */ + +int __attribute__((noinline,noclone)) +any_le_double (vector double a, vector double b) +{ + if (__builtin_expect (vec_any_le (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_le_double:\n\tvfchedbs\t%v\[0-9\]*,%v26,%v24\n\tjnle 1 } } */ + +int __attribute__((noinline,noclone)) +all_eq_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_all_eq (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_eq_int:\n\tvceqfs\t%v\[0-9\]*,%v24,%v26\n\tjne 1 } } */ + +int __attribute__((noinline,noclone)) +all_ne_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_all_ne (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_ne_int:\n\tvceqfs\t%v\[0-9\]*,%v24,%v26\n\tjle 1 } } */ + +int __attribute__((noinline,noclone)) +all_gt_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_all_gt (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_gt_int:\n\tvchfs\t%v\[0-9\]*,%v24,%v26\n\tjne 1 } } */ + +int __attribute__((noinline,noclone)) +all_lt_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_all_lt (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_lt_int:\n\tvchfs\t%v\[0-9\]*,%v26,%v24\n\tjne 1 } } */ + +int __attribute__((noinline,noclone)) +all_ge_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_all_ge (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_ge_int:\n\tvchfs\t%v\[0-9\]*,%v26,%v24\n\tjle 1 } } */ + +int __attribute__((noinline,noclone)) +all_le_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_all_le (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times all_le_int:\n\tvchfs\t%v\[0-9\]*,%v24,%v26\n\tjle 1 } } */ + +int __attribute__((noinline,noclone)) +any_eq_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_any_eq (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_eq_int:\n\tvceqfs\t%v\[0-9\]*,%v24,%v26\n\tjnle 1 } } */ + +int __attribute__((noinline,noclone)) +any_ne_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_any_ne (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_ne_int:\n\tvceqfs\t%v\[0-9\]*,%v24,%v26\n\tje 1 } } */ + +int __attribute__((noinline,noclone)) +any_gt_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_any_gt (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_gt_int:\n\tvchfs\t%v\[0-9\]*,%v24,%v26\n\tjnle 1 } } */ + +int __attribute__((noinline,noclone)) +any_lt_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_any_lt (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_lt_int:\n\tvchfs\t%v\[0-9\]*,%v26,%v24\n\tjnle 1 } } */ + +int __attribute__((noinline,noclone)) +any_ge_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_any_ge (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_ge_int:\n\tvchfs\t%v\[0-9\]*,%v26,%v24\n\tje 1 } } */ + +int __attribute__((noinline,noclone)) +any_le_int (vector int a, vector int b) +{ + if (__builtin_expect (vec_any_le (a, b), 1)) + foo (); +} +/* { dg-final { scan-assembler-times any_le_int:\n\tvchfs\t%v\[0-9\]*,%v24,%v26\n\tje 1 } } */ + -- cgit v1.1 From 9ad49cdb5340bcaab0ecc0574ed754af9ba71283 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Fri, 2 Dec 2016 08:24:27 +0000 Subject: S/390: Add vector pack/unpack patterns. gcc/ChangeLog: 2016-12-02 Andreas Krebbel * config/s390/vector.md (vec_halfhalf): New mode iterator. ("vec_pack_trunc_", "vec_pack_ssat_") ("vec_pack_usat_", "vec_unpacks_hi_v16qi") ("vec_unpacks_low_v16qi", "vec_unpacku_hi_v16qi") ("vec_unpacku_low_v16qi", "vec_unpacks_hi_v8hi") ("vec_unpacks_lo_v8hi", "vec_unpacku_hi_v8hi") ("vec_unpacku_lo_v8hi", "vec_unpacks_hi_v4si") ("vec_unpacks_lo_v4si", "vec_unpacku_hi_v4si") ("vec_unpacku_lo_v4si"): New pattern definitions. * config/s390/vx-builtins.md: Move VI_HW_HSD mode iterator to vector.md. From-SVN: r243157 --- gcc/ChangeLog | 14 +++ gcc/config/s390/vector.md | 198 +++++++++++++++++++++++++++++++++++++++-- gcc/config/s390/vx-builtins.md | 1 - 3 files changed, 203 insertions(+), 10 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d06661e..d2ecd0f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,19 @@ 2016-12-02 Andreas Krebbel + * config/s390/vector.md (vec_halfhalf): New mode iterator. + ("vec_pack_trunc_", "vec_pack_ssat_") + ("vec_pack_usat_", "vec_unpacks_hi_v16qi") + ("vec_unpacks_low_v16qi", "vec_unpacku_hi_v16qi") + ("vec_unpacku_low_v16qi", "vec_unpacks_hi_v8hi") + ("vec_unpacks_lo_v8hi", "vec_unpacku_hi_v8hi") + ("vec_unpacku_lo_v8hi", "vec_unpacks_hi_v4si") + ("vec_unpacks_lo_v4si", "vec_unpacku_hi_v4si") + ("vec_unpacku_lo_v4si"): New pattern definitions. + * config/s390/vx-builtins.md: Move VI_HW_HSD mode iterator to + vector.md. + +2016-12-02 Andreas Krebbel + * config/s390/s390-protos.h (s390_reverse_condition): New prototype. * config/s390/s390.c (s390_canonicalize_comparison): Fold compares diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index bc4f8da..d446d5f 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -38,7 +38,8 @@ (define_mode_iterator VIT_HW [V16QI V8HI V4SI V2DI V1TI TI]) (define_mode_iterator VI_HW [V16QI V8HI V4SI V2DI]) (define_mode_iterator VI_HW_QHS [V16QI V8HI V4SI]) -(define_mode_iterator VI_HW_HS [V8HI V4SI]) +(define_mode_iterator VI_HW_HSD [V8HI V4SI V2DI]) +(define_mode_iterator VI_HW_HS [V8HI V4SI]) (define_mode_iterator VI_HW_QH [V16QI V8HI]) ; All integer vector modes supported in a vector register + TImode @@ -114,6 +115,13 @@ (V1DF "V2SF") (V2DF "V4SF") (V1TF "V1DF")]) +; Vector with half the element size AND half the number of elements. +(define_mode_attr vec_halfhalf + [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI") + (V2SI "V2HI") (V4SI "V4HI") + (V2DI "V2SI") + (V2DF "V2SF")]) + ; The comparisons not setting CC iterate over the rtx code. (define_code_iterator VFCMP_HW_OP [eq gt ge]) (define_code_attr asm_fcmp_op [(eq "e") (gt "h") (ge "he")]) @@ -1223,6 +1231,185 @@ "vsel\t%v0,%2,%1,%3" [(set_attr "op_type" "VRR")]) +; vec_pack_trunc + +; vpkh, vpkf, vpkg +(define_insn "vec_pack_trunc_" + [(set (match_operand: 0 "register_operand" "=v") + (vec_concat: + (truncate: + (match_operand:VI_HW_HSD 1 "register_operand" "v")) + (truncate: + (match_operand:VI_HW_HSD 2 "register_operand" "v"))))] + "TARGET_VX" + "vpk\t%0,%1,%2" + [(set_attr "op_type" "VRR")]) + +; vpksh, vpksf, vpksg +(define_insn "vec_pack_ssat_" + [(set (match_operand: 0 "register_operand" "=v") + (vec_concat: + (ss_truncate: + (match_operand:VI_HW_HSD 1 "register_operand" "v")) + (ss_truncate: + (match_operand:VI_HW_HSD 2 "register_operand" "v"))))] + "TARGET_VX" + "vpks\t%0,%1,%2" + [(set_attr "op_type" "VRR")]) + +; vpklsh, vpklsf, vpklsg +(define_insn "vec_pack_usat_" + [(set (match_operand: 0 "register_operand" "=v") + (vec_concat: + (us_truncate: + (match_operand:VI_HW_HSD 1 "register_operand" "v")) + (us_truncate: + (match_operand:VI_HW_HSD 2 "register_operand" "v"))))] + "TARGET_VX" + "vpkls\t%0,%1,%2" + [(set_attr "op_type" "VRR")]) + +;; vector unpack v16qi + +; signed + +(define_insn "vec_unpacks_hi_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (sign_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "register_operand" "v") + (parallel [(const_int 0)(const_int 1)(const_int 2)(const_int 3) + (const_int 4)(const_int 5)(const_int 6)(const_int 7)]))))] + "TARGET_VX" + "vuphb\t%0,%1" + [(set_attr "op_type" "VRR")]) + +(define_insn "vec_unpacks_low_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (sign_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "register_operand" "v") + (parallel [(const_int 8) (const_int 9) (const_int 10)(const_int 11) + (const_int 12)(const_int 13)(const_int 14)(const_int 15)]))))] + "TARGET_VX" + "vuplb\t%0,%1" + [(set_attr "op_type" "VRR")]) + +; unsigned + +(define_insn "vec_unpacku_hi_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (zero_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "register_operand" "v") + (parallel [(const_int 0)(const_int 1)(const_int 2)(const_int 3) + (const_int 4)(const_int 5)(const_int 6)(const_int 7)]))))] + "TARGET_VX" + "vuplhb\t%0,%1" + [(set_attr "op_type" "VRR")]) + +(define_insn "vec_unpacku_low_v16qi" + [(set (match_operand:V8HI 0 "register_operand" "=v") + (zero_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 1 "register_operand" "v") + (parallel [(const_int 8) (const_int 9) (const_int 10)(const_int 11) + (const_int 12)(const_int 13)(const_int 14)(const_int 15)]))))] + "TARGET_VX" + "vupllb\t%0,%1" + [(set_attr "op_type" "VRR")]) + +;; vector unpack v8hi + +; signed + +(define_insn "vec_unpacks_hi_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand" "v") + (parallel [(const_int 0)(const_int 1)(const_int 2)(const_int 3)]))))] + "TARGET_VX" + "vuphh\t%0,%1" + [(set_attr "op_type" "VRR")]) + +(define_insn "vec_unpacks_lo_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand" "v") + (parallel [(const_int 4)(const_int 5)(const_int 6)(const_int 7)]))))] + "TARGET_VX" + "vuplhw\t%0,%1" + [(set_attr "op_type" "VRR")]) + +; unsigned + +(define_insn "vec_unpacku_hi_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (zero_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand" "v") + (parallel [(const_int 0)(const_int 1)(const_int 2)(const_int 3)]))))] + "TARGET_VX" + "vuplhh\t%0,%1" + [(set_attr "op_type" "VRR")]) + +(define_insn "vec_unpacku_lo_v8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (zero_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand" "v") + (parallel [(const_int 4)(const_int 5)(const_int 6)(const_int 7)]))))] + "TARGET_VX" + "vupllh\t%0,%1" + [(set_attr "op_type" "VRR")]) + +;; vector unpack v4si + +; signed + +(define_insn "vec_unpacks_hi_v4si" + [(set (match_operand:V2DI 0 "register_operand" "=v") + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "register_operand" "v") + (parallel [(const_int 0)(const_int 1)]))))] + "TARGET_VX" + "vuphf\t%0,%1" + [(set_attr "op_type" "VRR")]) + +(define_insn "vec_unpacks_lo_v4si" + [(set (match_operand:V2DI 0 "register_operand" "=v") + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "register_operand" "v") + (parallel [(const_int 2)(const_int 3)]))))] + "TARGET_VX" + "vuplf\t%0,%1" + [(set_attr "op_type" "VRR")]) + +; unsigned + +(define_insn "vec_unpacku_hi_v4si" + [(set (match_operand:V2DI 0 "register_operand" "=v") + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "register_operand" "v") + (parallel [(const_int 0)(const_int 1)]))))] + "TARGET_VX" + "vuplhf\t%0,%1" + [(set_attr "op_type" "VRR")]) + +(define_insn "vec_unpacku_lo_v4si" + [(set (match_operand:V2DI 0 "register_operand" "=v") + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "register_operand" "v") + (parallel [(const_int 2)(const_int 3)]))))] + "TARGET_VX" + "vupllf\t%0,%1" + [(set_attr "op_type" "VRR")]) ; reduc_smin @@ -1233,15 +1420,8 @@ ; vec_shl vrep + vsl ; vec_shr -; vec_pack_trunc -; vec_pack_ssat -; vec_pack_usat -; vec_pack_sfix_trunc +; vec_pack_sfix_trunc: convert + pack ? ; vec_pack_ufix_trunc -; vec_unpacks_hi -; vec_unpacks_low -; vec_unpacku_hi -; vec_unpacku_low ; vec_unpacks_float_hi ; vec_unpacks_float_lo ; vec_unpacku_float_hi diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md index 51d022c..b3818ee 100644 --- a/gcc/config/s390/vx-builtins.md +++ b/gcc/config/s390/vx-builtins.md @@ -24,7 +24,6 @@ (define_mode_iterator V_HW_32_64 [V4SI V2DI V2DF]) (define_mode_iterator VI_HW_SD [V4SI V2DI]) (define_mode_iterator V_HW_HSD [V8HI V4SI V2DI V2DF]) -(define_mode_iterator VI_HW_HSD [V8HI V4SI V2DI]) ; The element type of the vector with floating point modes translated ; to int modes of the same size. -- cgit v1.1 From 7f5fc63362a675a216f1475bb08fd42d8274f8cf Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Fri, 2 Dec 2016 08:25:27 +0000 Subject: S/390: Define vectorization_cost hook Define the vectorization_cost hook. The only change right now compared to the default implementation is the reduced costs for unaligned loads/stores. This is supposed to prevent unnecessary loop peeling performed to reach better alignments. Further tuning of this hook is required. -Andreas- gcc/ChangeLog: 2016-12-02 Andreas Krebbel * gcc/config/s390/s390.c (s390_builtin_vectorization_cost): New function. (TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST): Define target macro. gcc/testsuite/ChangeLog: 2016-12-02 Andreas Krebbel * gcc.target/s390/vector/vec-nopeel-1.c: New test. From-SVN: r243158 --- gcc/ChangeLog | 7 ++++ gcc/config/s390/s390.c | 37 ++++++++++++++++++++++ gcc/testsuite/ChangeLog | 4 +++ .../gcc.target/s390/vector/vec-nopeel-1.c | 17 ++++++++++ 4 files changed, 65 insertions(+) create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-nopeel-1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d2ecd0f..e5f8345 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,12 @@ 2016-12-02 Andreas Krebbel + * gcc/config/s390/s390.c (s390_builtin_vectorization_cost): New + function. + (TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST): Define target + macro. + +2016-12-02 Andreas Krebbel + * config/s390/vector.md (vec_halfhalf): New mode iterator. ("vec_pack_trunc_", "vec_pack_ssat_") ("vec_pack_usat_", "vec_unpacks_hi_v16qi") diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index dab4f43..767666e 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -3674,6 +3674,40 @@ s390_address_cost (rtx addr, machine_mode mode ATTRIBUTE_UNUSED, return ad.indx? COSTS_N_INSNS (1) + 1 : COSTS_N_INSNS (1); } +/* Implement targetm.vectorize.builtin_vectorization_cost. */ +static int +s390_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype, + int misalign ATTRIBUTE_UNUSED) +{ + switch (type_of_cost) + { + case scalar_stmt: + case scalar_load: + case scalar_store: + case vector_stmt: + case vector_load: + case vector_store: + case vec_to_scalar: + case scalar_to_vec: + case cond_branch_not_taken: + case vec_perm: + case vec_promote_demote: + case unaligned_load: + case unaligned_store: + return 1; + + case cond_branch_taken: + return 3; + + case vec_construct: + return TYPE_VECTOR_SUBPARTS (vectype) - 1; + + default: + gcc_unreachable (); + } +} + /* If OP is a SYMBOL_REF of a thread-local symbol, return its TLS mode, otherwise return 0. */ @@ -15428,6 +15462,9 @@ s390_excess_precision (enum excess_precision_type type) #define TARGET_REGISTER_MOVE_COST s390_register_move_cost #undef TARGET_MEMORY_MOVE_COST #define TARGET_MEMORY_MOVE_COST s390_memory_move_cost +#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST +#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \ + s390_builtin_vectorization_cost #undef TARGET_MACHINE_DEPENDENT_REORG #define TARGET_MACHINE_DEPENDENT_REORG s390_reorg diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index dc269ef..e39ab1c 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,9 @@ 2016-12-02 Andreas Krebbel + * gcc.target/s390/vector/vec-nopeel-1.c: New test. + +2016-12-02 Andreas Krebbel + * gcc.target/s390/zvector/vec-cmp-2.c: New test. 2016-12-02 Andreas Krebbel diff --git a/gcc/testsuite/gcc.target/s390/vector/vec-nopeel-1.c b/gcc/testsuite/gcc.target/s390/vector/vec-nopeel-1.c new file mode 100644 index 0000000..581c371 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vector/vec-nopeel-1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch -march=z13" } */ +/* { dg-require-effective-target vector } */ + +int +foo (int * restrict a, int n) +{ + int i, result = 0; + + for (i = 0; i < n * 4; i++) + result += a[i]; + return result; +} + +/* We do NOT want this loop to get peeled. Without peeling no scalar + memory add should appear. */ +/* { dg-final { scan-assembler-not "\ta\t" } } */ -- cgit v1.1 From 8f61415f1f776d35f0d616bd662019a659a6e536 Mon Sep 17 00:00:00 2001 From: Dominik Vogt Date: Fri, 2 Dec 2016 08:26:19 +0000 Subject: PR target/77822: Add helper macro EXTRACT_ARGS_IN_RANGE to system.h. The macro can be used to validate the arguments of zero_extract and sign_extract to fix this problem: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77822 gcc/ChangeLog: 2016-12-02 Dominik Vogt PR target/77822 * rtl.h (EXTRACT_ARGS_IN_RANGE): New. From-SVN: r243159 --- gcc/ChangeLog | 5 +++++ gcc/rtl.h | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e5f8345..8c71c21 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Dominik Vogt + + PR target/77822 + * rtl.h (EXTRACT_ARGS_IN_RANGE): New. + 2016-12-02 Andreas Krebbel * gcc/config/s390/s390.c (s390_builtin_vectorization_cost): New diff --git a/gcc/rtl.h b/gcc/rtl.h index 5fde698..a5efa28 100644 --- a/gcc/rtl.h +++ b/gcc/rtl.h @@ -2694,6 +2694,16 @@ get_full_set_src_cost (rtx x, machine_mode mode, struct full_rtx_costs *c) } #endif +/* A convenience macro to validate the arguments of a zero_extract + expression. It determines whether SIZE lies inclusively within + [1, RANGE], POS lies inclusively within between [0, RANGE - 1] + and the sum lies inclusively within [1, RANGE]. RANGE must be + >= 1, but SIZE and POS may be negative. */ +#define EXTRACT_ARGS_IN_RANGE(SIZE, POS, RANGE) \ + (IN_RANGE ((POS), 0, (unsigned HOST_WIDE_INT) (RANGE) - 1) \ + && IN_RANGE ((SIZE), 1, (unsigned HOST_WIDE_INT) (RANGE) \ + - (unsigned HOST_WIDE_INT)(POS))) + /* In explow.c */ extern HOST_WIDE_INT trunc_int_for_mode (HOST_WIDE_INT, machine_mode); extern rtx plus_constant (machine_mode, rtx, HOST_WIDE_INT, bool = false); -- cgit v1.1 From 0f6f72e80525f14e91d4d1ee6d3bd91fd7c96859 Mon Sep 17 00:00:00 2001 From: Dominik Vogt Date: Fri, 2 Dec 2016 08:30:16 +0000 Subject: PR target/77822: S390: Validate argument range of {zero,sign}_extract. With some undefined code, combine generates patterns where the arguments to *_extract are out of range, e.b. a negative bit position. If the s390 backend accepts these, they lead to not just undefined behaviour but invalid assembly instructions (argument out of the allowed range). So this patch makes sure that the rtl expressions with out of range arguments are rejected. gcc/ChangeLog: 2016-12-02 Dominik Vogt PR target/77822 * config/s390/s390.md ("extzv") ("*extzv") ("*extzvdi_lshiftrt") ("*_ior_and_sr_ze") ("*extract1bitdi") ("*insv", "*insv_rnsbg_noshift") ("*insv_rnsbg_srl", "*insv_mem_reg") ("*insvdi_mem_reghigh", "*insvdi_reg_imm"): Use EXTRACT_ARGS_IN_RANGE to validate the arguments of zero_extract and sign_extract. gcc/testsuite/ChangeLog: 2016-12-02 Dominik Vogt PR target/77822 * gcc.target/s390/s390.exp: Support .C tests. * gcc.target/s390/pr77822-2.c: New test. * gcc.target/s390/pr77822-1.C: New test. From-SVN: r243160 --- gcc/ChangeLog | 13 ++ gcc/config/s390/s390.md | 20 +- gcc/testsuite/ChangeLog | 7 + gcc/testsuite/gcc.target/s390/pr77822-1.C | 21 ++ gcc/testsuite/gcc.target/s390/pr77822-2.c | 307 ++++++++++++++++++++++++++++++ gcc/testsuite/gcc.target/s390/s390.exp | 8 +- 6 files changed, 369 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/pr77822-1.C create mode 100644 gcc/testsuite/gcc.target/s390/pr77822-2.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 8c71c21..e357932 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,6 +1,19 @@ 2016-12-02 Dominik Vogt PR target/77822 + * config/s390/s390.md ("extzv") + ("*extzv") + ("*extzvdi_lshiftrt") + ("*_ior_and_sr_ze") + ("*extract1bitdi") + ("*insv", "*insv_rnsbg_noshift") + ("*insv_rnsbg_srl", "*insv_mem_reg") + ("*insvdi_mem_reghigh", "*insvdi_reg_imm"): Use EXTRACT_ARGS_IN_RANGE + to validate the arguments of zero_extract and sign_extract. + +2016-12-02 Dominik Vogt + + PR target/77822 * rtl.h (EXTRACT_ARGS_IN_RANGE): New. 2016-12-02 Andreas Krebbel diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 5844e28..4f2effd 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -3741,6 +3741,8 @@ (clobber (reg:CC CC_REGNUM))])] "TARGET_Z10" { + if (! EXTRACT_ARGS_IN_RANGE (INTVAL (operands[2]), INTVAL (operands[3]), 64)) + FAIL; /* Starting with zEC12 there is risbgn not clobbering CC. */ if (TARGET_ZEC12) { @@ -3760,7 +3762,9 @@ (match_operand 2 "const_int_operand" "") ; size (match_operand 3 "const_int_operand" ""))) ; start ] - "" + " + && EXTRACT_ARGS_IN_RANGE (INTVAL (operands[2]), INTVAL (operands[3]), + GET_MODE_BITSIZE (mode))" "\t%0,%1,64-%2,128+63,%3+%2" ; dst, src, start, end, shift [(set_attr "op_type" "RIE") (set_attr "z10prop" "z10_super_E1")]) @@ -3773,6 +3777,7 @@ (lshiftrt:DI (match_operand:DI 3 "register_operand" "d") (match_operand:DI 4 "nonzero_shift_count_operand" "")))] " + && EXTRACT_ARGS_IN_RANGE (INTVAL (operands[1]), INTVAL (operands[2]), 64) && 64 - UINTVAL (operands[4]) >= UINTVAL (operands[1])" "\t%0,%3,%2,%2+%1-1,128-%2-%1-%4" [(set_attr "op_type" "RIE") @@ -3791,6 +3796,7 @@ (match_operand 5 "const_int_operand" "")) ; start 4)))] " + && EXTRACT_ARGS_IN_RANGE (INTVAL (operands[4]), INTVAL (operands[5]), 64) && UINTVAL (operands[2]) == (~(0ULL) << UINTVAL (operands[4]))" "\t%0,%3,64-%4,63,%4+%5" [(set_attr "op_type" "RIE") @@ -3804,7 +3810,8 @@ (const_int 1) ; size (match_operand 2 "const_int_operand" "")) ; start (const_int 0)))] - "" + " + && EXTRACT_ARGS_IN_RANGE (1, INTVAL (operands[2]), 64)" "\t%0,%1,64-1,128+63,%2+1" ; dst, src, start, end, shift [(set_attr "op_type" "RIE") (set_attr "z10prop" "z10_super_E1")]) @@ -3919,6 +3926,8 @@ (match_operand 2 "const_int_operand" "I")) ; pos (match_operand:GPR 3 "nonimmediate_operand" "d"))] " + && EXTRACT_ARGS_IN_RANGE (INTVAL (operands[1]), INTVAL (operands[2]), + GET_MODE_BITSIZE (mode)) && (INTVAL (operands[1]) + INTVAL (operands[2])) <= " "\t%0,%3,%2,%2+%1-1,-%2-%1" [(set_attr "op_type" "RIE") @@ -4214,6 +4223,7 @@ (match_operand:DI 3 "nonimmediate_operand" "d"))) (clobber (reg:CC CC_REGNUM))] "TARGET_Z10 + && EXTRACT_ARGS_IN_RANGE (INTVAL (operands[1]), INTVAL (operands[2]), 64) && INTVAL (operands[1]) + INTVAL (operands[2]) == 64" "rnsbg\t%0,%3,%2,63,0" [(set_attr "op_type" "RIE")]) @@ -4230,6 +4240,7 @@ (match_operand:DI 4 "nonimmediate_operand" "d"))) (clobber (reg:CC CC_REGNUM))] "TARGET_Z10 + && EXTRACT_ARGS_IN_RANGE (INTVAL (operands[1]), INTVAL (operands[2]), 64) && INTVAL (operands[3]) == 64 - INTVAL (operands[1]) - INTVAL (operands[2])" "rnsbg\t%0,%4,%2,%2+%1-1,%3" [(set_attr "op_type" "RIE")]) @@ -4239,7 +4250,8 @@ (match_operand 1 "const_int_operand" "n,n") (const_int 0)) (match_operand:W 2 "register_operand" "d,d"))] - "INTVAL (operands[1]) > 0 + "EXTRACT_ARGS_IN_RANGE (INTVAL (operands[1]), 0, 64) + && INTVAL (operands[1]) > 0 && INTVAL (operands[1]) <= GET_MODE_BITSIZE (SImode) && INTVAL (operands[1]) % BITS_PER_UNIT == 0" { @@ -4260,6 +4272,7 @@ (lshiftrt:DI (match_operand:DI 2 "register_operand" "d") (const_int 32)))] "TARGET_ZARCH + && EXTRACT_ARGS_IN_RANGE (INTVAL (operands[1]), 0, 64) && INTVAL (operands[1]) > 0 && INTVAL (operands[1]) <= GET_MODE_BITSIZE (SImode) && INTVAL (operands[1]) % BITS_PER_UNIT == 0" @@ -4278,6 +4291,7 @@ (match_operand 1 "const_int_operand" "n")) (match_operand:DI 2 "const_int_operand" "n"))] "TARGET_ZARCH + && EXTRACT_ARGS_IN_RANGE (16, INTVAL (operands[1]), 64) && INTVAL (operands[1]) >= 0 && INTVAL (operands[1]) < BITS_PER_WORD && INTVAL (operands[1]) % 16 == 0" diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index e39ab1c..447aaf6 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2016-12-02 Dominik Vogt + + PR target/77822 + * gcc.target/s390/s390.exp: Support .C tests. + * gcc.target/s390/pr77822-2.c: New test. + * gcc.target/s390/pr77822-1.C: New test. + 2016-12-02 Andreas Krebbel * gcc.target/s390/vector/vec-nopeel-1.c: New test. diff --git a/gcc/testsuite/gcc.target/s390/pr77822-1.C b/gcc/testsuite/gcc.target/s390/pr77822-1.C new file mode 100644 index 0000000..bd5a9b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/pr77822-1.C @@ -0,0 +1,21 @@ +/* Regression test for PR/77822. */ + +/* { dg-do compile } */ +/* { dg-options "-O3 -march=zEC12" } */ + +class A { + void m_fn1(); + char m_datawidth; + char m_subunits; + int m_subunit_infos[]; +}; +int a; +long b; +void A::m_fn1() { + int c = 32, d = m_datawidth / c; + for (int e = 0; e < d; e++) { + int f = e * 32; + if (b >> f & 1) + m_subunit_infos[m_subunits] = a; + } +} diff --git a/gcc/testsuite/gcc.target/s390/pr77822-2.c b/gcc/testsuite/gcc.target/s390/pr77822-2.c new file mode 100644 index 0000000..6789152 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/pr77822-2.c @@ -0,0 +1,307 @@ +/* This testcase checks that the shift operand of r*sbg instructions is in + range. */ + +/* { dg-do compile } */ +/* { dg-options "-O3 -march=zEC12 -Wno-shift-count-overflow" } */ + +int g; + +void pos_ll_129 (long long b) +{ + if (b >> 129 & 1) + g = b; +} + +void sizepos_ll_134 (long long b) +{ + if (b >> 134 & 1) + g = b; +} + +void pos_ll_65 (long long b) +{ + if (b >> 65 & 1) + g = b; +} + +void sizepos_ll_70 (long long b) +{ + if (b >> 70 & 1) + g = b; +} + +void pos_ll_33 (long long b) +{ + if (b >> 33 & 1) + g = b; +} + +void sizepos_ll_38 (long long b) +{ + if (b >> 38 & 1) + g = b; +} + +void pos_ll_17 (long long b) +{ + if (b >> 17 & 1) + g = b; +} + +void sizepos_ll_22 (long long b) +{ + if (b >> 22 & 1) + g = b; +} + +void pos_ll_8 (long long b) +{ + if (b >> 8 & 1) + g = b; +} + +void sizepos_ll_13 (long long b) +{ + if (b >> 13 & 1) + g = b; +} + +void pos_l_129 (long b) +{ + if (b >> 129 & 1) + g = b; +} + +void sizepos_l_134 (long b) +{ + if (b >> 134 & 1) + g = b; +} + +void pos_l_65 (long b) +{ + if (b >> 65 & 1) + g = b; +} + +void sizepos_l_70 (long b) +{ + if (b >> 70 & 1) + g = b; +} + +void pos_l_33 (long b) +{ + if (b >> 33 & 1) + g = b; +} + +void sizepos_l_38 (long b) +{ + if (b >> 38 & 1) + g = b; +} + +void pos_l_17 (long b) +{ + if (b >> 17 & 1) + g = b; +} + +void sizepos_l_22 (long b) +{ + if (b >> 22 & 1) + g = b; +} + +void pos_l_8 (long b) +{ + if (b >> 8 & 1) + g = b; +} + +void sizepos_l_13 (long b) +{ + if (b >> 13 & 1) + g = b; +} + +void pos_i_129 (int b) +{ + if (b >> 129 & 1) + g = b; +} + +void sizepos_i_134 (int b) +{ + if (b >> 134 & 1) + g = b; +} + +void pos_i_65 (int b) +{ + if (b >> 65 & 1) + g = b; +} + +void sizepos_i_70 (int b) +{ + if (b >> 70 & 1) + g = b; +} + +void pos_i_33 (int b) +{ + if (b >> 33 & 1) + g = b; +} + +void sizepos_i_38 (int b) +{ + if (b >> 38 & 1) + g = b; +} + +void pos_i_17 (int b) +{ + if (b >> 17 & 1) + g = b; +} + +void sizepos_i_22 (int b) +{ + if (b >> 22 & 1) + g = b; +} + +void pos_i_8 (int b) +{ + if (b >> 8 & 1) + g = b; +} + +void sizepos_i_13 (int b) +{ + if (b >> 13 & 1) + g = b; +} + +void pos_s_129 (short b) +{ + if (b >> 129 & 1) + g = b; +} + +void sizepos_s_134 (short b) +{ + if (b >> 134 & 1) + g = b; +} + +void pos_s_65 (short b) +{ + if (b >> 65 & 1) + g = b; +} + +void sizepos_s_70 (short b) +{ + if (b >> 70 & 1) + g = b; +} + +void pos_s_33 (short b) +{ + if (b >> 33 & 1) + g = b; +} + +void sizepos_s_38 (short b) +{ + if (b >> 38 & 1) + g = b; +} + +void pos_s_17 (short b) +{ + if (b >> 17 & 1) + g = b; +} + +void sizepos_s_22 (short b) +{ + if (b >> 22 & 1) + g = b; +} + +void pos_s_8 (short b) +{ + if (b >> 8 & 1) + g = b; +} + +void sizepos_s_13 (short b) +{ + if (b >> 13 & 1) + g = b; +} + +void pos_c_129 (signed char b) +{ + if (b >> 129 & 1) + g = b; +} + +void sizepos_c_134 (signed char b) +{ + if (b >> 134 & 1) + g = b; +} + +void pos_c_65 (signed char b) +{ + if (b >> 65 & 1) + g = b; +} + +void sizepos_c_70 (signed char b) +{ + if (b >> 70 & 1) + g = b; +} + +void pos_c_33 (signed char b) +{ + if (b >> 33 & 1) + g = b; +} + +void sizepos_c_38 (signed char b) +{ + if (b >> 38 & 1) + g = b; +} + +void pos_c_17 (signed char b) +{ + if (b >> 17 & 1) + g = b; +} + +void sizepos_c_22 (signed char b) +{ + if (b >> 22 & 1) + g = b; +} + +void pos_c_8 (signed char b) +{ + if (b >> 8 & 1) + g = b; +} + +void sizepos_c_13 (signed char b) +{ + if (b >> 13 & 1) + g = b; +} diff --git a/gcc/testsuite/gcc.target/s390/s390.exp b/gcc/testsuite/gcc.target/s390/s390.exp index f4ad7a1..450dcaf 100644 --- a/gcc/testsuite/gcc.target/s390/s390.exp +++ b/gcc/testsuite/gcc.target/s390/s390.exp @@ -90,16 +90,16 @@ dg-init set md_tests $srcdir/$subdir/md/*.c # Main loop. -dg-runtest [lsort [prune [glob -nocomplain $srcdir/$subdir/*.\[cS\]] \ +dg-runtest [lsort [prune [glob -nocomplain $srcdir/$subdir/*.{c,S,C}] \ $md_tests]] "" $DEFAULT_CFLAGS -dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*vector*/*.\[cS\]]] \ +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*vector*/*.]] \ "" $DEFAULT_CFLAGS -dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/target-attribute/*.\[cS\]]] \ +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/target-attribute/*.{c,S,C}]] \ "" $DEFAULT_CFLAGS -dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/md/*.\[cS\]]] \ +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/md/*.{c,S,C}]] \ "" $DEFAULT_CFLAGS # Additional hotpatch torture tests. -- cgit v1.1 From bba13c0c4359649687894823900e091576cbc2e6 Mon Sep 17 00:00:00 2001 From: Dominik Vogt Date: Fri, 2 Dec 2016 08:31:09 +0000 Subject: S/390: Fix litpool-r3-1.c. gcc/testsuite/ChangeLog: 2016-12-02 Dominik Vogt * gcc.target/s390/litpool-r3-1.c: Fix label number test. From-SVN: r243161 --- gcc/testsuite/ChangeLog | 4 ++++ gcc/testsuite/gcc.target/s390/litpool-r3-1.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 447aaf6..14296a3 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,9 @@ 2016-12-02 Dominik Vogt + * gcc.target/s390/litpool-r3-1.c: Fix label number test. + +2016-12-02 Dominik Vogt + PR target/77822 * gcc.target/s390/s390.exp: Support .C tests. * gcc.target/s390/pr77822-2.c: New test. diff --git a/gcc/testsuite/gcc.target/s390/litpool-r3-1.c b/gcc/testsuite/gcc.target/s390/litpool-r3-1.c index 8ee50cf..91e4807 100644 --- a/gcc/testsuite/gcc.target/s390/litpool-r3-1.c +++ b/gcc/testsuite/gcc.target/s390/litpool-r3-1.c @@ -13,4 +13,4 @@ int foo(void) return c; } -/* { dg-final { scan-assembler-times "\tlarl\t%r3,.L3" 1 } } */ +/* { dg-final { scan-assembler-times "\tlarl\t%r3,.L\[0-9\]" 1 } } */ -- cgit v1.1 From 32ff7e39c15206dada1406bdc06e18db9e02d248 Mon Sep 17 00:00:00 2001 From: Dominik Vogt Date: Fri, 2 Dec 2016 08:32:40 +0000 Subject: Do not simplify "(and (reg) (const bit)" to if_then_else. combine_simplify_rtx() tries to replace rtx expressions with just two possible values with an experession that uses if_then_else: (if_then_else (condition) (value1) (value2)) If the original expression is e.g. (and (reg) (const_int 2)) where the constant is the mask for a single bit, the replacement results in a more complex expression than before: (if_then_else (ne (zero_extract (reg) (1) (31))) (2) (0)) Similar replacements are done for (signextend (and ...)) (zeroextend (and ...)) Suppress the replacement this special case in if_then_else_cond(). gcc/ChangeLog: 2016-12-02 Dominik Vogt * combine.c (combine_simplify_rtx): Suppress replacement of "(and (reg) (const_int bit))" with "if_then_else". From-SVN: r243162 --- gcc/ChangeLog | 5 +++++ gcc/combine.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e357932..a3fcd8b 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,10 @@ 2016-12-02 Dominik Vogt + * combine.c (combine_simplify_rtx): Suppress replacement of + "(and (reg) (const_int bit))" with "if_then_else". + +2016-12-02 Dominik Vogt + PR target/77822 * config/s390/s390.md ("extzv") ("*extzv") diff --git a/gcc/combine.c b/gcc/combine.c index b429453..7ba634a 100644 --- a/gcc/combine.c +++ b/gcc/combine.c @@ -5602,6 +5602,18 @@ combine_simplify_rtx (rtx x, machine_mode op0_mode, int in_dest, && OBJECT_P (SUBREG_REG (XEXP (x, 0))))))) { rtx cond, true_rtx, false_rtx; + unsigned HOST_WIDE_INT nz; + + /* If the operation is an AND wrapped in a SIGN_EXTEND or ZERO_EXTEND with + either operand being just a constant single bit value, do nothing since + IF_THEN_ELSE is likely to increase the expression's complexity. */ + if (HWI_COMPUTABLE_MODE_P (mode) + && pow2p_hwi (nz = nonzero_bits (x, mode)) + && ! ((code == SIGN_EXTEND || code == ZERO_EXTEND) + && GET_CODE (XEXP (x, 0)) == AND + && CONST_INT_P (XEXP (XEXP (x, 0), 0)) + && UINTVAL (XEXP (XEXP (x, 0), 0)) == nz)) + return x; cond = if_then_else_cond (x, &true_rtx, &false_rtx); if (cond != 0 -- cgit v1.1 From a4f2895465da4c8856b119a5787b95db345567a9 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Fri, 2 Dec 2016 09:36:01 +0100 Subject: Fix runtime error: left shift of negative value (PR PR ipa/78555 * sreal.c (sreal::to_int): Make absolute value before shifting. (sreal::operator/): Likewise. (sreal_verify_negative_division): New test. (void sreal_c_tests): Call the new test. * sreal.h (sreal::normalize_up): Use new SREAL_ABS and SREAL_SIGN macros. (sreal::normalize_down): Likewise. From-SVN: r243163 --- gcc/ChangeLog | 11 +++++++++++ gcc/sreal.c | 20 +++++++++++++++++--- gcc/sreal.h | 9 +++++---- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a3fcd8b..4453842 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,14 @@ +2016-12-02 Martin Liska + + PR ipa/78555 + * sreal.c (sreal::to_int): Make absolute value before shifting. + (sreal::operator/): Likewise. + (sreal_verify_negative_division): New test. + (void sreal_c_tests): Call the new test. + * sreal.h (sreal::normalize_up): Use new SREAL_ABS and + SREAL_SIGN macros. + (sreal::normalize_down): Likewise. + 2016-12-02 Dominik Vogt * combine.c (combine_simplify_rtx): Suppress replacement of diff --git a/gcc/sreal.c b/gcc/sreal.c index 9c43b4e..52e530d 100644 --- a/gcc/sreal.c +++ b/gcc/sreal.c @@ -102,14 +102,14 @@ sreal::shift_right (int s) int64_t sreal::to_int () const { - int64_t sign = m_sig < 0 ? -1 : 1; + int64_t sign = SREAL_SIGN (m_sig); if (m_exp <= -SREAL_BITS) return 0; if (m_exp >= SREAL_PART_BITS) return sign * INTTYPE_MAXIMUM (int64_t); if (m_exp > 0) - return m_sig << m_exp; + return sign * (SREAL_ABS (m_sig) << m_exp); if (m_exp < 0) return m_sig >> -m_exp; return m_sig; @@ -229,7 +229,8 @@ sreal::operator/ (const sreal &other) const { gcc_checking_assert (other.m_sig != 0); sreal r; - r.m_sig = (m_sig << SREAL_PART_BITS) / other.m_sig; + r.m_sig + = SREAL_SIGN (m_sig) * (SREAL_ABS (m_sig) << SREAL_PART_BITS) / other.m_sig; r.m_exp = m_exp - other.m_exp - SREAL_PART_BITS; r.normalize (); return r; @@ -334,6 +335,18 @@ sreal_verify_shifting (void) verify_shifting (values[i]); } +/* Verify division by (of) a negative value. */ + +static void +sreal_verify_negative_division (void) +{ + ASSERT_EQ (sreal (1) / sreal (1), sreal (1)); + ASSERT_EQ (sreal (-1) / sreal (-1), sreal (1)); + ASSERT_EQ (sreal (-1234567) / sreal (-1234567), sreal (1)); + ASSERT_EQ (sreal (-1234567) / sreal (1234567), sreal (-1)); + ASSERT_EQ (sreal (1234567) / sreal (-1234567), sreal (-1)); +} + /* Run all of the selftests within this file. */ void sreal_c_tests () @@ -341,6 +354,7 @@ void sreal_c_tests () sreal_verify_basics (); sreal_verify_arithmetics (); sreal_verify_shifting (); + sreal_verify_negative_division (); } } // namespace selftest diff --git a/gcc/sreal.h b/gcc/sreal.h index ce9cdbb..21f14b0 100644 --- a/gcc/sreal.h +++ b/gcc/sreal.h @@ -31,6 +31,9 @@ along with GCC; see the file COPYING3. If not see #define SREAL_BITS SREAL_PART_BITS +#define SREAL_SIGN(v) (v < 0 ? -1: 1) +#define SREAL_ABS(v) (v < 0 ? -v: v) + /* Structure for holding a simple real number. */ class sreal { @@ -193,7 +196,6 @@ inline sreal operator>> (const sreal &a, int exp) inline void sreal::normalize_up () { - int64_t s = m_sig < 0 ? -1 : 1; unsigned HOST_WIDE_INT sig = absu_hwi (m_sig); int shift = SREAL_PART_BITS - 2 - floor_log2 (sig); @@ -208,7 +210,7 @@ sreal::normalize_up () m_exp = -SREAL_MAX_EXP; sig = 0; } - if (s == -1) + if (SREAL_SIGN (m_sig) == -1) m_sig = -sig; else m_sig = sig; @@ -221,7 +223,6 @@ sreal::normalize_up () inline void sreal::normalize_down () { - int64_t s = m_sig < 0 ? -1 : 1; int last_bit; unsigned HOST_WIDE_INT sig = absu_hwi (m_sig); int shift = floor_log2 (sig) - SREAL_PART_BITS + 2; @@ -246,7 +247,7 @@ sreal::normalize_down () m_exp = SREAL_MAX_EXP; sig = SREAL_MAX_SIG; } - if (s == -1) + if (SREAL_SIGN (m_sig) == -1) m_sig = -sig; else m_sig = sig; -- cgit v1.1 From a717444986a981b21b42fccfd982dcb6ebe42254 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 2 Dec 2016 09:42:12 +0100 Subject: re PR rtl-optimization/78575 (ICE: in trunc_int_for_mode, at explow.c:55 with -O2 -g) PR rtl-optimization/78575 * config/i386/i386.c (timode_scalar_chain::fix_debug_reg_uses): Use DF infrastructure to wrap all V1TImode reg uses into TImode subreg if not already wrapped in a subreg. Make sure df_insn_rescan does not affect further iterations. * gcc.dg/pr78575.c: New test. From-SVN: r243164 --- gcc/ChangeLog | 8 ++++++++ gcc/config/i386/i386.c | 38 ++++++++++++++++++++------------------ gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/gcc.dg/pr78575.c | 16 ++++++++++++++++ 4 files changed, 49 insertions(+), 18 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/pr78575.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 4453842..a83f528 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2016-12-02 Jakub Jelinek + + PR rtl-optimization/78575 + * config/i386/i386.c (timode_scalar_chain::fix_debug_reg_uses): Use + DF infrastructure to wrap all V1TImode reg uses into TImode subreg + if not already wrapped in a subreg. Make sure df_insn_rescan does not + affect further iterations. + 2016-12-02 Martin Liska PR ipa/78555 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index a5f5339..5226e454 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -3831,30 +3831,32 @@ timode_scalar_chain::fix_debug_reg_uses (rtx reg) if (!flag_var_tracking) return; - df_ref ref; - for (ref = DF_REG_USE_CHAIN (REGNO (reg)); - ref; - ref = DF_REF_NEXT_REG (ref)) + df_ref ref, next; + for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next) { rtx_insn *insn = DF_REF_INSN (ref); + /* Make sure the next ref is for a different instruction, + so that we're not affected by the rescan. */ + next = DF_REF_NEXT_REG (ref); + while (next && DF_REF_INSN (next) == insn) + next = DF_REF_NEXT_REG (next); + if (DEBUG_INSN_P (insn)) { /* It may be a debug insn with a TImode variable in register. */ - rtx val = PATTERN (insn); - if (GET_MODE (val) != TImode) - continue; - gcc_assert (GET_CODE (val) == VAR_LOCATION); - rtx loc = PAT_VAR_LOCATION_LOC (val); - /* It may have been converted to TImode already. */ - if (GET_MODE (loc) == TImode) - continue; - gcc_assert (REG_P (loc) - && GET_MODE (loc) == V1TImode); - /* Convert V1TImode register, which has been updated by a SET - insn before, to SUBREG TImode. */ - PAT_VAR_LOCATION_LOC (val) = gen_rtx_SUBREG (TImode, loc, 0); - df_insn_rescan (insn); + bool changed = false; + for (; ref != next; ref = DF_REF_NEXT_REG (ref)) + { + rtx *loc = DF_REF_LOC (ref); + if (REG_P (*loc) && GET_MODE (*loc) == V1TImode) + { + *loc = gen_rtx_SUBREG (TImode, *loc, 0); + changed = true; + } + } + if (changed) + df_insn_rescan (insn); } } } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 14296a3..6ee4b13 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Jakub Jelinek + + PR rtl-optimization/78575 + * gcc.dg/pr78575.c: New test. + 2016-12-02 Dominik Vogt * gcc.target/s390/litpool-r3-1.c: Fix label number test. diff --git a/gcc/testsuite/gcc.dg/pr78575.c b/gcc/testsuite/gcc.dg/pr78575.c new file mode 100644 index 0000000..6b27f10 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr78575.c @@ -0,0 +1,16 @@ +/* PR rtl-optimization/78575 */ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -g -Wno-psabi" } */ + +typedef unsigned __int128 V __attribute__((vector_size(64))); + +V g; + +void +foo (V v) +{ + unsigned __int128 x = 1; + int c = v[1] <= ~x; + v &= v[1]; + g = v; +} -- cgit v1.1 From 60ebe8ce1d029cbff8ef80c967f98ba43d746f3b Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 2 Dec 2016 09:44:42 +0100 Subject: re PR rtl-optimization/78547 (ICE: in loc_cmp, at var-tracking.c:3417 with -Os -g -mstringop-strategy=libcall -freorder-blocks-algorithm=simple) PR rtl-optimization/78547 * emit-rtl.c (unshare_all_rtl): Make sure DECL_RTL and DECL_INCOMING_RTL is not shared. * config/i386/i386.c (convert_scalars_to_vectors): If any insns have been converted, adjust all parameter's DEC_RTL and DECL_INCOMING_RTL back from V1TImode to TImode if the parameters have TImode. * gcc.dg/pr78547.c: New test. From-SVN: r243165 --- gcc/ChangeLog | 8 ++++++++ gcc/config/i386/i386.c | 22 ++++++++++++++++++++++ gcc/emit-rtl.c | 8 ++++++++ gcc/testsuite/ChangeLog | 3 +++ gcc/testsuite/gcc.dg/pr78547.c | 18 ++++++++++++++++++ 5 files changed, 59 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/pr78547.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a83f528..260a66d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,13 @@ 2016-12-02 Jakub Jelinek + PR rtl-optimization/78547 + * emit-rtl.c (unshare_all_rtl): Make sure DECL_RTL and + DECL_INCOMING_RTL is not shared. + * config/i386/i386.c (convert_scalars_to_vectors): If any + insns have been converted, adjust all parameter's DEC_RTL and + DECL_INCOMING_RTL back from V1TImode to TImode if the parameters have + TImode. + PR rtl-optimization/78575 * config/i386/i386.c (timode_scalar_chain::fix_debug_reg_uses): Use DF infrastructure to wrap all V1TImode reg uses into TImode subreg diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5226e454..5678fa2 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -4075,6 +4075,28 @@ convert_scalars_to_vector () crtl->stack_alignment_needed = 128; if (crtl->stack_alignment_estimated < 128) crtl->stack_alignment_estimated = 128; + /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */ + if (TARGET_64BIT) + for (tree parm = DECL_ARGUMENTS (current_function_decl); + parm; parm = DECL_CHAIN (parm)) + { + if (TYPE_MODE (TREE_TYPE (parm)) != TImode) + continue; + if (DECL_RTL_SET_P (parm) + && GET_MODE (DECL_RTL (parm)) == V1TImode) + { + rtx r = DECL_RTL (parm); + if (REG_P (r)) + SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0)); + } + if (DECL_INCOMING_RTL (parm) + && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode) + { + rtx r = DECL_INCOMING_RTL (parm); + if (REG_P (r)) + DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); + } + } } return 0; diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c index 02512d3..d2ac88b 100644 --- a/gcc/emit-rtl.c +++ b/gcc/emit-rtl.c @@ -2668,6 +2668,14 @@ unsigned int unshare_all_rtl (void) { unshare_all_rtl_1 (get_insns ()); + + for (tree decl = DECL_ARGUMENTS (cfun->decl); decl; decl = DECL_CHAIN (decl)) + { + if (DECL_RTL_SET_P (decl)) + SET_DECL_RTL (decl, copy_rtx_if_shared (DECL_RTL (decl))); + DECL_INCOMING_RTL (decl) = copy_rtx_if_shared (DECL_INCOMING_RTL (decl)); + } + return 0; } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 6ee4b13..490d081 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,8 @@ 2016-12-02 Jakub Jelinek + PR rtl-optimization/78547 + * gcc.dg/pr78547.c: New test. + PR rtl-optimization/78575 * gcc.dg/pr78575.c: New test. diff --git a/gcc/testsuite/gcc.dg/pr78547.c b/gcc/testsuite/gcc.dg/pr78547.c new file mode 100644 index 0000000..9300cbc --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr78547.c @@ -0,0 +1,18 @@ +/* PR rtl-optimization/78547 */ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-Os -g -freorder-blocks-algorithm=simple -Wno-psabi" } */ +/* { dg-additional-options "-mstringop-strategy=libcall" { target i?86-*-* x86_64-*-* } } */ + +typedef unsigned __int128 u128; +typedef unsigned __int128 V __attribute__ ((vector_size (64))); + +V +foo (u128 a, u128 b, u128 c, V d) +{ + V e = (V) {a}; + V f = e & 1; + e = 0 != e; + c = c; + f = f << ((V) {c} & 7); + return f + e; +} -- cgit v1.1 From 46f87aa3e06d9db97f21e25af4be91efc3176000 Mon Sep 17 00:00:00 2001 From: Georg-Johann Lay Date: Fri, 2 Dec 2016 09:05:56 +0000 Subject: invoke.texi (AVR Options): Point to absdata. * doc/invoke.texi (AVR Options) [-mabsdata]: Point to absdata. * doc/extend.texi (AVR Variable Attributes) [progmem]: Hint about linker description to avoid progmem altogether. [absdata]: Point to -mabsdata option. From-SVN: r243170 --- gcc/ChangeLog | 7 +++++++ gcc/doc/extend.texi | 26 ++++++++++++++++++++++++++ gcc/doc/invoke.texi | 3 ++- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 260a66d..d50f546 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2016-12-02 Georg-Johann Lay + + * doc/invoke.texi (AVR Options) [-mabsdata]: Point to absdata. + * doc/extend.texi (AVR Variable Attributes) [progmem]: Hint + about linker description to avoid progmem altogether. + [absdata]: Point to -mabsdata option. + 2016-12-02 Jakub Jelinek PR rtl-optimization/78547 diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index d873403..c40e289 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -5929,6 +5929,30 @@ int read_var (int i) @} @end smallexample +Please notice that on these devices, there is no need for @code{progmem} +at all. Just use an appropriate linker description file like outlined below. + +@smallexample + .text : + @{ ... + @} > text + /* Leave .rodata in flash and add an offset of 0x4000 to all + addresses so that respective objects can be accessed by LD + instructions and open coded C/C++. This means there is no + need for progmem in the source and no overhead by read-only + data in RAM. */ + .rodata ADDR(.text) + SIZEOF (.text) + 0x4000 : + @{ + *(.rodata) + *(.rodata*) + *(.gnu.linkonce.r*) + @} AT> text + /* No more need to put .rodata into .data: + Removed all .rodata entries from .data. */ + .data : + @{ ... +@end smallexample + @end table @item io @@ -6001,6 +6025,8 @@ warning like @end itemize +See also the @option{-mabsdata} @ref{AVR Options,command-line option}. + @end table @node Blackfin Variable Attributes diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 8a0cad7..fd549ec 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -15402,7 +15402,8 @@ GCC supports the following AVR devices and ISAs: Assume that all data in static storage can be accessed by LDS / STS instructions. This option has only an effect on reduced Tiny devices like -ATtiny40. +ATtiny40. See also the @code{absdata} +@ref{AVR Variable Attributes,variable attribute}. @item -maccumulate-args @opindex maccumulate-args -- cgit v1.1 From 45d5091d5cfa98d0a6906b9333e0396344040810 Mon Sep 17 00:00:00 2001 From: Georg-Johann Lay Date: Fri, 2 Dec 2016 09:12:22 +0000 Subject: avr-arch.h (avr_mcu_t): Remove field. * config/avr/avr-arch.h (avr_mcu_t) [n_flash]: Remove field. * config/avr/avr-devices.c (AVR_MCU): Remove N_FLASH macro argument. * config/avr/avr-mcus.def (AVR_MCU): Remove initializer for n_flash. * config/avr/avr.c (avr_set_core_architecture) [avr_n_flash]: Use avr_mcu_types.flash_size to compute default value. * config/avr/gen-avr-mmcu-specs.c (print_mcu) [cc1_n_flash]: Use mcu->flash_size to compute value for spec. From-SVN: r243171 --- gcc/ChangeLog | 10 + gcc/config/avr/avr-arch.h | 3 - gcc/config/avr/avr-devices.c | 6 +- gcc/config/avr/avr-mcus.def | 551 ++++++++++++++++++------------------ gcc/config/avr/avr.c | 2 +- gcc/config/avr/gen-avr-mmcu-specs.c | 4 +- 6 files changed, 291 insertions(+), 285 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d50f546..a0cefa7 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,15 @@ 2016-12-02 Georg-Johann Lay + * config/avr/avr-arch.h (avr_mcu_t) [n_flash]: Remove field. + * config/avr/avr-devices.c (AVR_MCU): Remove N_FLASH macro argument. + * config/avr/avr-mcus.def (AVR_MCU): Remove initializer for n_flash. + * config/avr/avr.c (avr_set_core_architecture) [avr_n_flash]: Use + avr_mcu_types.flash_size to compute default value. + * config/avr/gen-avr-mmcu-specs.c (print_mcu) [cc1_n_flash]: Use + mcu->flash_size to compute value for spec. + +2016-12-02 Georg-Johann Lay + * doc/invoke.texi (AVR Options) [-mabsdata]: Point to absdata. * doc/extend.texi (AVR Variable Attributes) [progmem]: Hint about linker description to avoid progmem altogether. diff --git a/gcc/config/avr/avr-arch.h b/gcc/config/avr/avr-arch.h index e6a2d75..98b394f 100644 --- a/gcc/config/avr/avr-arch.h +++ b/gcc/config/avr/avr-arch.h @@ -120,9 +120,6 @@ typedef struct /* Start of text section. */ int text_section_start; - /* Number of 64k segments in the flash. */ - int n_flash; - /* Flash size in bytes. */ int flash_size; } avr_mcu_t; diff --git a/gcc/config/avr/avr-devices.c b/gcc/config/avr/avr-devices.c index cef3b9a..1bd3e5f 100644 --- a/gcc/config/avr/avr-devices.c +++ b/gcc/config/avr/avr-devices.c @@ -111,12 +111,12 @@ avr_texinfo[] = const avr_mcu_t avr_mcu_types[] = { -#define AVR_MCU(NAME, ARCH, DEV_ATTRIBUTE, MACRO, DATA_SEC, TEXT_SEC, N_FLASH, FLASH_SIZE)\ - { NAME, ARCH, DEV_ATTRIBUTE, MACRO, DATA_SEC, TEXT_SEC, N_FLASH, FLASH_SIZE }, +#define AVR_MCU(NAME, ARCH, DEV_ATTRIBUTE, MACRO, DATA_SEC, TEXT_SEC, FLASH_SIZE)\ + { NAME, ARCH, DEV_ATTRIBUTE, MACRO, DATA_SEC, TEXT_SEC, FLASH_SIZE }, #include "avr-mcus.def" #undef AVR_MCU /* End of list. */ - { NULL, ARCH_UNKNOWN, AVR_ISA_NONE, NULL, 0, 0, 0, 0 } + { NULL, ARCH_UNKNOWN, AVR_ISA_NONE, NULL, 0, 0, 0 } }; diff --git a/gcc/config/avr/avr-mcus.def b/gcc/config/avr/avr-mcus.def index 4008741..aafa224 100644 --- a/gcc/config/avr/avr-mcus.def +++ b/gcc/config/avr/avr-mcus.def @@ -59,301 +59,298 @@ TEXT_START First address of Flash, used in -Ttext=. - N_FLASH Number of 64 KiB flash segments, rounded up. The default - value for -mn-flash=. - FLASH_SIZE Flash size in bytes. "avr2" must be first for the "0" default to work as intended. */ /* Classic, <= 8K. */ -AVR_MCU ("avr2", ARCH_AVR2, AVR_ERRATA_SKIP, NULL, 0x0060, 0x0, 6, 0x2000) +AVR_MCU ("avr2", ARCH_AVR2, AVR_ERRATA_SKIP, NULL, 0x0060, 0x0, 0x60000) -AVR_MCU ("at90s2313", ARCH_AVR2, AVR_SHORT_SP, "__AVR_AT90S2313__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("at90s2323", ARCH_AVR2, AVR_SHORT_SP, "__AVR_AT90S2323__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("at90s2333", ARCH_AVR2, AVR_SHORT_SP, "__AVR_AT90S2333__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("at90s2343", ARCH_AVR2, AVR_SHORT_SP, "__AVR_AT90S2343__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("attiny22", ARCH_AVR2, AVR_SHORT_SP, "__AVR_ATtiny22__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("attiny26", ARCH_AVR2, AVR_SHORT_SP, "__AVR_ATtiny26__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("at90s4414", ARCH_AVR2, AVR_ISA_NONE, "__AVR_AT90S4414__", 0x0060, 0x0, 1, 0x1000) -AVR_MCU ("at90s4433", ARCH_AVR2, AVR_SHORT_SP, "__AVR_AT90S4433__", 0x0060, 0x0, 1, 0x1000) -AVR_MCU ("at90s4434", ARCH_AVR2, AVR_ISA_NONE, "__AVR_AT90S4434__", 0x0060, 0x0, 1, 0x1000) -AVR_MCU ("at90s8515", ARCH_AVR2, AVR_ERRATA_SKIP, "__AVR_AT90S8515__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("at90c8534", ARCH_AVR2, AVR_ISA_NONE, "__AVR_AT90C8534__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("at90s8535", ARCH_AVR2, AVR_ISA_NONE, "__AVR_AT90S8535__", 0x0060, 0x0, 1, 0x2000) +AVR_MCU ("at90s2313", ARCH_AVR2, AVR_SHORT_SP, "__AVR_AT90S2313__", 0x0060, 0x0, 0x800) +AVR_MCU ("at90s2323", ARCH_AVR2, AVR_SHORT_SP, "__AVR_AT90S2323__", 0x0060, 0x0, 0x800) +AVR_MCU ("at90s2333", ARCH_AVR2, AVR_SHORT_SP, "__AVR_AT90S2333__", 0x0060, 0x0, 0x800) +AVR_MCU ("at90s2343", ARCH_AVR2, AVR_SHORT_SP, "__AVR_AT90S2343__", 0x0060, 0x0, 0x800) +AVR_MCU ("attiny22", ARCH_AVR2, AVR_SHORT_SP, "__AVR_ATtiny22__", 0x0060, 0x0, 0x800) +AVR_MCU ("attiny26", ARCH_AVR2, AVR_SHORT_SP, "__AVR_ATtiny26__", 0x0060, 0x0, 0x800) +AVR_MCU ("at90s4414", ARCH_AVR2, AVR_ISA_NONE, "__AVR_AT90S4414__", 0x0060, 0x0, 0x1000) +AVR_MCU ("at90s4433", ARCH_AVR2, AVR_SHORT_SP, "__AVR_AT90S4433__", 0x0060, 0x0, 0x1000) +AVR_MCU ("at90s4434", ARCH_AVR2, AVR_ISA_NONE, "__AVR_AT90S4434__", 0x0060, 0x0, 0x1000) +AVR_MCU ("at90s8515", ARCH_AVR2, AVR_ERRATA_SKIP, "__AVR_AT90S8515__", 0x0060, 0x0, 0x2000) +AVR_MCU ("at90c8534", ARCH_AVR2, AVR_ISA_NONE, "__AVR_AT90C8534__", 0x0060, 0x0, 0x2000) +AVR_MCU ("at90s8535", ARCH_AVR2, AVR_ISA_NONE, "__AVR_AT90S8535__", 0x0060, 0x0, 0x2000) /* Classic + MOVW, <= 8K. */ -AVR_MCU ("avr25", ARCH_AVR25, AVR_ISA_NONE, NULL, 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("ata5272", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATA5272__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("ata6616c", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATA6616C__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("attiny13", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny13__", 0x0060, 0x0, 1, 0x400) -AVR_MCU ("attiny13a", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny13A__", 0x0060, 0x0, 1, 0x400) -AVR_MCU ("attiny2313", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny2313__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("attiny2313a", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny2313A__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("attiny24", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny24__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("attiny24a", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny24A__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("attiny4313", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny4313__", 0x0060, 0x0, 1, 0x1000) -AVR_MCU ("attiny44", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny44__", 0x0060, 0x0, 1, 0x1000) -AVR_MCU ("attiny44a", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny44A__", 0x0060, 0x0, 1, 0x1000) -AVR_MCU ("attiny441", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny441__", 0x0100, 0x0, 1, 0x1000) -AVR_MCU ("attiny84", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny84__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("attiny84a", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny84A__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("attiny25", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny25__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("attiny45", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny45__", 0x0060, 0x0, 1, 0x1000) -AVR_MCU ("attiny85", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny85__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("attiny261", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny261__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("attiny261a", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny261A__", 0x0060, 0x0, 1, 0x800) -AVR_MCU ("attiny461", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny461__", 0x0060, 0x0, 1, 0x1000) -AVR_MCU ("attiny461a", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny461A__", 0x0060, 0x0, 1, 0x1000) -AVR_MCU ("attiny861", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny861__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("attiny861a", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny861A__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("attiny43u", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny43U__", 0x0060, 0x0, 1, 0x1000) -AVR_MCU ("attiny87", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny87__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("attiny48", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny48__", 0x0100, 0x0, 1, 0x1000) -AVR_MCU ("attiny88", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny88__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("attiny828", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny828__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("attiny841", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny841__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("at86rf401", ARCH_AVR25, AVR_ISA_NONE, "__AVR_AT86RF401__", 0x0060, 0x0, 1, 0x800) +AVR_MCU ("avr25", ARCH_AVR25, AVR_ISA_NONE, NULL, 0x0060, 0x0, 0x2000) +AVR_MCU ("ata5272", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATA5272__", 0x0100, 0x0, 0x2000) +AVR_MCU ("ata6616c", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATA6616C__", 0x0100, 0x0, 0x2000) +AVR_MCU ("attiny13", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny13__", 0x0060, 0x0, 0x400) +AVR_MCU ("attiny13a", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny13A__", 0x0060, 0x0, 0x400) +AVR_MCU ("attiny2313", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny2313__", 0x0060, 0x0, 0x800) +AVR_MCU ("attiny2313a", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny2313A__", 0x0060, 0x0, 0x800) +AVR_MCU ("attiny24", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny24__", 0x0060, 0x0, 0x800) +AVR_MCU ("attiny24a", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny24A__", 0x0060, 0x0, 0x800) +AVR_MCU ("attiny4313", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny4313__", 0x0060, 0x0, 0x1000) +AVR_MCU ("attiny44", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny44__", 0x0060, 0x0, 0x1000) +AVR_MCU ("attiny44a", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny44A__", 0x0060, 0x0, 0x1000) +AVR_MCU ("attiny441", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny441__", 0x0100, 0x0, 0x1000) +AVR_MCU ("attiny84", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny84__", 0x0060, 0x0, 0x2000) +AVR_MCU ("attiny84a", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny84A__", 0x0060, 0x0, 0x2000) +AVR_MCU ("attiny25", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny25__", 0x0060, 0x0, 0x800) +AVR_MCU ("attiny45", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny45__", 0x0060, 0x0, 0x1000) +AVR_MCU ("attiny85", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny85__", 0x0060, 0x0, 0x2000) +AVR_MCU ("attiny261", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny261__", 0x0060, 0x0, 0x800) +AVR_MCU ("attiny261a", ARCH_AVR25, AVR_SHORT_SP, "__AVR_ATtiny261A__", 0x0060, 0x0, 0x800) +AVR_MCU ("attiny461", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny461__", 0x0060, 0x0, 0x1000) +AVR_MCU ("attiny461a", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny461A__", 0x0060, 0x0, 0x1000) +AVR_MCU ("attiny861", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny861__", 0x0060, 0x0, 0x2000) +AVR_MCU ("attiny861a", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny861A__", 0x0060, 0x0, 0x2000) +AVR_MCU ("attiny43u", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny43U__", 0x0060, 0x0, 0x1000) +AVR_MCU ("attiny87", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny87__", 0x0100, 0x0, 0x2000) +AVR_MCU ("attiny48", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny48__", 0x0100, 0x0, 0x1000) +AVR_MCU ("attiny88", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny88__", 0x0100, 0x0, 0x2000) +AVR_MCU ("attiny828", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny828__", 0x0100, 0x0, 0x2000) +AVR_MCU ("attiny841", ARCH_AVR25, AVR_ISA_NONE, "__AVR_ATtiny841__", 0x0100, 0x0, 0x2000) +AVR_MCU ("at86rf401", ARCH_AVR25, AVR_ISA_NONE, "__AVR_AT86RF401__", 0x0060, 0x0, 0x800) /* Classic, > 8K, <= 64K. */ -AVR_MCU ("avr3", ARCH_AVR3, AVR_ISA_NONE, NULL, 0x0060, 0x0, 1, 0x6000) -AVR_MCU ("at43usb355", ARCH_AVR3, AVR_ISA_NONE, "__AVR_AT43USB355__", 0x0060, 0x0, 1, 0x6000) -AVR_MCU ("at76c711", ARCH_AVR3, AVR_ISA_NONE, "__AVR_AT76C711__", 0x0060, 0x0, 1, 0x4000) +AVR_MCU ("avr3", ARCH_AVR3, AVR_ISA_NONE, NULL, 0x0060, 0x0, 0x6000) +AVR_MCU ("at43usb355", ARCH_AVR3, AVR_ISA_NONE, "__AVR_AT43USB355__", 0x0060, 0x0, 0x6000) +AVR_MCU ("at76c711", ARCH_AVR3, AVR_ISA_NONE, "__AVR_AT76C711__", 0x0060, 0x0, 0x4000) /* Classic, == 128K. */ -AVR_MCU ("avr31", ARCH_AVR31, AVR_ERRATA_SKIP, NULL, 0x0060, 0x0, 2, 0x20000) -AVR_MCU ("atmega103", ARCH_AVR31, AVR_ERRATA_SKIP, "__AVR_ATmega103__", 0x0060, 0x0, 2, 0x20000) -AVR_MCU ("at43usb320", ARCH_AVR31, AVR_ISA_NONE, "__AVR_AT43USB320__", 0x0060, 0x0, 1, 0x10000) +AVR_MCU ("avr31", ARCH_AVR31, AVR_ERRATA_SKIP, NULL, 0x0060, 0x0, 0x20000) +AVR_MCU ("atmega103", ARCH_AVR31, AVR_ERRATA_SKIP, "__AVR_ATmega103__", 0x0060, 0x0, 0x20000) +AVR_MCU ("at43usb320", ARCH_AVR31, AVR_ISA_NONE, "__AVR_AT43USB320__", 0x0060, 0x0, 0x10000) /* Classic + MOVW + JMP/CALL. */ -AVR_MCU ("avr35", ARCH_AVR35, AVR_ISA_NONE, NULL, 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("ata5505", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATA5505__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("ata6617c", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATA6617C__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("ata664251", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATA664251__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("at90usb82", ARCH_AVR35, AVR_ISA_NONE, "__AVR_AT90USB82__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("at90usb162", ARCH_AVR35, AVR_ISA_NONE, "__AVR_AT90USB162__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega8u2", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATmega8U2__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("atmega16u2", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATmega16U2__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega32u2", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATmega32U2__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("attiny167", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATtiny167__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("attiny1634", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATtiny1634__", 0x0100, 0x0, 1, 0x4000) +AVR_MCU ("avr35", ARCH_AVR35, AVR_ISA_NONE, NULL, 0x0100, 0x0, 0x4000) +AVR_MCU ("ata5505", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATA5505__", 0x0100, 0x0, 0x4000) +AVR_MCU ("ata6617c", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATA6617C__", 0x0100, 0x0, 0x4000) +AVR_MCU ("ata664251", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATA664251__", 0x0100, 0x0, 0x4000) +AVR_MCU ("at90usb82", ARCH_AVR35, AVR_ISA_NONE, "__AVR_AT90USB82__", 0x0100, 0x0, 0x2000) +AVR_MCU ("at90usb162", ARCH_AVR35, AVR_ISA_NONE, "__AVR_AT90USB162__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega8u2", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATmega8U2__", 0x0100, 0x0, 0x2000) +AVR_MCU ("atmega16u2", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATmega16U2__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega32u2", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATmega32U2__", 0x0100, 0x0, 0x8000) +AVR_MCU ("attiny167", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATtiny167__", 0x0100, 0x0, 0x4000) +AVR_MCU ("attiny1634", ARCH_AVR35, AVR_ISA_NONE, "__AVR_ATtiny1634__", 0x0100, 0x0, 0x4000) /* Enhanced, <= 8K. */ -AVR_MCU ("avr4", ARCH_AVR4, AVR_ISA_NONE, NULL, 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("ata6285", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATA6285__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("ata6286", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATA6286__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("ata6289", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATA6289__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("ata6612c", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATA6612C__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("atmega8", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega8__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("atmega8a", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega8A__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("atmega48", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega48__", 0x0100, 0x0, 1, 0x1000) -AVR_MCU ("atmega48a", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega48A__", 0x0100, 0x0, 1, 0x1000) -AVR_MCU ("atmega48p", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega48P__", 0x0100, 0x0, 1, 0x1000) -AVR_MCU ("atmega48pa", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega48PA__", 0x0100, 0x0, 1, 0x1000) -AVR_MCU ("atmega48pb", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega48PB__", 0x0100, 0x0, 1, 0x1000) -AVR_MCU ("atmega88", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega88__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("atmega88a", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega88A__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("atmega88p", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega88P__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("atmega88pa", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega88PA__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("atmega88pb", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega88PB__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("atmega8515", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega8515__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("atmega8535", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega8535__", 0x0060, 0x0, 1, 0x2000) -AVR_MCU ("atmega8hva", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega8HVA__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("at90pwm1", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM1__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("at90pwm2", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM2__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("at90pwm2b", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM2B__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("at90pwm3", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM3__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("at90pwm3b", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM3B__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("at90pwm81", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM81__", 0x0100, 0x0, 1, 0x2000) +AVR_MCU ("avr4", ARCH_AVR4, AVR_ISA_NONE, NULL, 0x0060, 0x0, 0x2000) +AVR_MCU ("ata6285", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATA6285__", 0x0100, 0x0, 0x2000) +AVR_MCU ("ata6286", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATA6286__", 0x0100, 0x0, 0x2000) +AVR_MCU ("ata6289", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATA6289__", 0x0100, 0x0, 0x2000) +AVR_MCU ("ata6612c", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATA6612C__", 0x0100, 0x0, 0x2000) +AVR_MCU ("atmega8", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega8__", 0x0060, 0x0, 0x2000) +AVR_MCU ("atmega8a", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega8A__", 0x0060, 0x0, 0x2000) +AVR_MCU ("atmega48", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega48__", 0x0100, 0x0, 0x1000) +AVR_MCU ("atmega48a", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega48A__", 0x0100, 0x0, 0x1000) +AVR_MCU ("atmega48p", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega48P__", 0x0100, 0x0, 0x1000) +AVR_MCU ("atmega48pa", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega48PA__", 0x0100, 0x0, 0x1000) +AVR_MCU ("atmega48pb", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega48PB__", 0x0100, 0x0, 0x1000) +AVR_MCU ("atmega88", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega88__", 0x0100, 0x0, 0x2000) +AVR_MCU ("atmega88a", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega88A__", 0x0100, 0x0, 0x2000) +AVR_MCU ("atmega88p", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega88P__", 0x0100, 0x0, 0x2000) +AVR_MCU ("atmega88pa", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega88PA__", 0x0100, 0x0, 0x2000) +AVR_MCU ("atmega88pb", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega88PB__", 0x0100, 0x0, 0x2000) +AVR_MCU ("atmega8515", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega8515__", 0x0060, 0x0, 0x2000) +AVR_MCU ("atmega8535", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega8535__", 0x0060, 0x0, 0x2000) +AVR_MCU ("atmega8hva", ARCH_AVR4, AVR_ISA_NONE, "__AVR_ATmega8HVA__", 0x0100, 0x0, 0x2000) +AVR_MCU ("at90pwm1", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM1__", 0x0100, 0x0, 0x2000) +AVR_MCU ("at90pwm2", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM2__", 0x0100, 0x0, 0x2000) +AVR_MCU ("at90pwm2b", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM2B__", 0x0100, 0x0, 0x2000) +AVR_MCU ("at90pwm3", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM3__", 0x0100, 0x0, 0x2000) +AVR_MCU ("at90pwm3b", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM3B__", 0x0100, 0x0, 0x2000) +AVR_MCU ("at90pwm81", ARCH_AVR4, AVR_ISA_NONE, "__AVR_AT90PWM81__", 0x0100, 0x0, 0x2000) /* Enhanced, > 8K, <= 64K. */ -AVR_MCU ("avr5", ARCH_AVR5, AVR_ISA_NONE, NULL, 0x0060, 0x0, 1, 0x4000) -AVR_MCU ("ata5702m322", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5702M322__", 0x0200, 0x0, 1, 0x10000) -AVR_MCU ("ata5782", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5782__", 0x0200, 0x8000, 1, 0xd000) -AVR_MCU ("ata5790", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5790__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("ata5790n", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5790N__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("ata5791", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5791__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("ata5795", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5795__", 0x0100, 0x0, 1, 0x2000) -AVR_MCU ("ata5831", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5831__", 0x0200, 0x8000, 1, 0xd000) -AVR_MCU ("ata6613c", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA6613C__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("ata6614q", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA6614Q__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("ata8210", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA8210__", 0x0200, 0x8000, 1, 0xd000) -AVR_MCU ("ata8510", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA8510__", 0x0200, 0x8000, 1, 0xd000) -AVR_MCU ("atmega16", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16__", 0x0060, 0x0, 1, 0x4000) -AVR_MCU ("atmega16a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16A__", 0x0060, 0x0, 1, 0x4000) -AVR_MCU ("atmega161", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega161__", 0x0060, 0x0, 1, 0x4000) -AVR_MCU ("atmega162", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega162__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega163", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega163__", 0x0060, 0x0, 1, 0x4000) -AVR_MCU ("atmega164a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega164A__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega164p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega164P__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega164pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega164PA__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega165", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega165__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega165a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega165A__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega165p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega165P__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega165pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega165PA__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega168", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega168__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega168a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega168A__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega168p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega168P__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega168pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega168PA__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega168pb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega168PB__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega169", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega169__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega169a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega169A__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega169p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega169P__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega169pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega169PA__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega16hvb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16HVB__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega16hvbrevb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16HVBREVB__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega16m1", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16M1__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega16u4", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16U4__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega32a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32A__", 0x0060, 0x0, 1, 0x8000) -AVR_MCU ("atmega32", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32__", 0x0060, 0x0, 1, 0x8000) -AVR_MCU ("atmega323", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega323__", 0x0060, 0x0, 1, 0x8000) -AVR_MCU ("atmega324a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega324A__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega324p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega324P__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega324pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega324PA__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega325", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega325__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega325a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega325A__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega325p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega325P__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega325pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega325PA__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega3250", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3250__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega3250a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3250A__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega3250p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3250P__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega3250pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3250PA__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega328", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega328__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega328p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega328P__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega328pb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega328PB__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega329", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega329__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega329a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega329A__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega329p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega329P__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega329pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega329PA__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega3290", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3290__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega3290a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3290A__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega3290p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3290P__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega3290pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3290PA__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega32c1", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32C1__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega32m1", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32M1__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega32u4", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32U4__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega32u6", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32U6__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega406", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega406__", 0x0100, 0x0, 1, 0xa000) -AVR_MCU ("atmega64", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega64a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64A__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega640", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega640__", 0x0200, 0x0, 1, 0x10000) -AVR_MCU ("atmega644", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega644__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega644a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega644A__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega644p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega644P__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega644pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega644PA__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega645", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega645__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega645a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega645A__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega645p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega645P__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega6450", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6450__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega6450a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6450A__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega6450p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6450P__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega649", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega649__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega649a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega649A__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega649p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega649P__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega6490", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6490__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega16hva", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16HVA__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega16hva2", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16HVA2__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("atmega32hvb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32HVB__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("atmega6490a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6490A__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega6490p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6490P__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega64c1", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64C1__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega64m1", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64M1__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega64hve", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64HVE__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega64hve2", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64HVE2__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("atmega64rfr2", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64RFR2__", 0x0200, 0x0, 1, 0x10000) -AVR_MCU ("atmega644rfr2", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega644RFR2__", 0x0200, 0x0, 1, 0x10000) -AVR_MCU ("atmega32hvbrevb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32HVBREVB__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("at90can32", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90CAN32__", 0x0100, 0x0, 1, 0x8000) -AVR_MCU ("at90can64", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90CAN64__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("at90pwm161", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90PWM161__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("at90pwm216", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90PWM216__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("at90pwm316", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90PWM316__", 0x0100, 0x0, 1, 0x4000) -AVR_MCU ("at90scr100", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90SCR100__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("at90usb646", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90USB646__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("at90usb647", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90USB647__", 0x0100, 0x0, 1, 0x10000) -AVR_MCU ("at94k", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT94K__", 0x0060, 0x0, 1, 0x8000) -AVR_MCU ("m3000", ARCH_AVR5, AVR_ISA_NONE, "__AVR_M3000__", 0x1000, 0x0, 1, 0x10000) +AVR_MCU ("avr5", ARCH_AVR5, AVR_ISA_NONE, NULL, 0x0060, 0x0, 0x4000) +AVR_MCU ("ata5702m322", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5702M322__", 0x0200, 0x0, 0x10000) +AVR_MCU ("ata5782", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5782__", 0x0200, 0x8000, 0xd000) +AVR_MCU ("ata5790", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5790__", 0x0100, 0x0, 0x4000) +AVR_MCU ("ata5790n", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5790N__", 0x0100, 0x0, 0x4000) +AVR_MCU ("ata5791", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5791__", 0x0100, 0x0, 0x4000) +AVR_MCU ("ata5795", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5795__", 0x0100, 0x0, 0x2000) +AVR_MCU ("ata5831", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA5831__", 0x0200, 0x8000, 0xd000) +AVR_MCU ("ata6613c", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA6613C__", 0x0100, 0x0, 0x4000) +AVR_MCU ("ata6614q", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA6614Q__", 0x0100, 0x0, 0x8000) +AVR_MCU ("ata8210", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA8210__", 0x0200, 0x8000, 0xd000) +AVR_MCU ("ata8510", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATA8510__", 0x0200, 0x8000, 0xd000) +AVR_MCU ("atmega16", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16__", 0x0060, 0x0, 0x4000) +AVR_MCU ("atmega16a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16A__", 0x0060, 0x0, 0x4000) +AVR_MCU ("atmega161", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega161__", 0x0060, 0x0, 0x4000) +AVR_MCU ("atmega162", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega162__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega163", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega163__", 0x0060, 0x0, 0x4000) +AVR_MCU ("atmega164a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega164A__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega164p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega164P__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega164pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega164PA__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega165", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega165__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega165a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega165A__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega165p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega165P__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega165pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega165PA__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega168", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega168__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega168a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega168A__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega168p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega168P__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega168pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega168PA__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega168pb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega168PB__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega169", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega169__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega169a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega169A__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega169p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega169P__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega169pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega169PA__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega16hvb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16HVB__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega16hvbrevb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16HVBREVB__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega16m1", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16M1__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega16u4", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16U4__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega32a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32A__", 0x0060, 0x0, 0x8000) +AVR_MCU ("atmega32", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32__", 0x0060, 0x0, 0x8000) +AVR_MCU ("atmega323", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega323__", 0x0060, 0x0, 0x8000) +AVR_MCU ("atmega324a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega324A__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega324p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega324P__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega324pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega324PA__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega325", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega325__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega325a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega325A__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega325p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega325P__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega325pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega325PA__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega3250", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3250__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega3250a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3250A__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega3250p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3250P__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega3250pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3250PA__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega328", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega328__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega328p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega328P__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega328pb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega328PB__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega329", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega329__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega329a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega329A__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega329p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega329P__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega329pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega329PA__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega3290", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3290__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega3290a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3290A__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega3290p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3290P__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega3290pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega3290PA__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega32c1", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32C1__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega32m1", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32M1__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega32u4", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32U4__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega32u6", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32U6__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega406", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega406__", 0x0100, 0x0, 0xa000) +AVR_MCU ("atmega64", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega64a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64A__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega640", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega640__", 0x0200, 0x0, 0x10000) +AVR_MCU ("atmega644", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega644__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega644a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega644A__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega644p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega644P__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega644pa", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega644PA__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega645", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega645__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega645a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega645A__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega645p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega645P__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega6450", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6450__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega6450a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6450A__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega6450p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6450P__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega649", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega649__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega649a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega649A__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega649p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega649P__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega6490", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6490__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega16hva", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16HVA__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega16hva2", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega16HVA2__", 0x0100, 0x0, 0x4000) +AVR_MCU ("atmega32hvb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32HVB__", 0x0100, 0x0, 0x8000) +AVR_MCU ("atmega6490a", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6490A__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega6490p", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega6490P__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega64c1", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64C1__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega64m1", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64M1__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega64hve", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64HVE__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega64hve2", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64HVE2__", 0x0100, 0x0, 0x10000) +AVR_MCU ("atmega64rfr2", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega64RFR2__", 0x0200, 0x0, 0x10000) +AVR_MCU ("atmega644rfr2", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega644RFR2__", 0x0200, 0x0, 0x10000) +AVR_MCU ("atmega32hvbrevb", ARCH_AVR5, AVR_ISA_NONE, "__AVR_ATmega32HVBREVB__", 0x0100, 0x0, 0x8000) +AVR_MCU ("at90can32", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90CAN32__", 0x0100, 0x0, 0x8000) +AVR_MCU ("at90can64", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90CAN64__", 0x0100, 0x0, 0x10000) +AVR_MCU ("at90pwm161", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90PWM161__", 0x0100, 0x0, 0x4000) +AVR_MCU ("at90pwm216", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90PWM216__", 0x0100, 0x0, 0x4000) +AVR_MCU ("at90pwm316", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90PWM316__", 0x0100, 0x0, 0x4000) +AVR_MCU ("at90scr100", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90SCR100__", 0x0100, 0x0, 0x10000) +AVR_MCU ("at90usb646", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90USB646__", 0x0100, 0x0, 0x10000) +AVR_MCU ("at90usb647", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT90USB647__", 0x0100, 0x0, 0x10000) +AVR_MCU ("at94k", ARCH_AVR5, AVR_ISA_NONE, "__AVR_AT94K__", 0x0060, 0x0, 0x8000) +AVR_MCU ("m3000", ARCH_AVR5, AVR_ISA_NONE, "__AVR_M3000__", 0x1000, 0x0, 0x10000) /* Enhanced, == 128K. */ -AVR_MCU ("avr51", ARCH_AVR51, AVR_ISA_NONE, NULL, 0x0100, 0x0, 2, 0x20000) -AVR_MCU ("atmega128", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega128__", 0x0100, 0x0, 2, 0x20000) -AVR_MCU ("atmega128a", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega128A__", 0x0100, 0x0, 2, 0x20000) -AVR_MCU ("atmega1280", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega1280__", 0x0200, 0x0, 2, 0x20000) -AVR_MCU ("atmega1281", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega1281__", 0x0200, 0x0, 2, 0x20000) -AVR_MCU ("atmega1284", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega1284__", 0x0100, 0x0, 2, 0x20000) -AVR_MCU ("atmega1284p", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega1284P__", 0x0100, 0x0, 2, 0x20000) -AVR_MCU ("atmega128rfa1", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega128RFA1__", 0x0200, 0x0, 2, 0x20000) -AVR_MCU ("atmega128rfr2", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega128RFR2__", 0x0200, 0x0, 2, 0x20000) -AVR_MCU ("atmega1284rfr2", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega1284RFR2__", 0x0200, 0x0, 2, 0x20000) -AVR_MCU ("at90can128", ARCH_AVR51, AVR_ISA_NONE, "__AVR_AT90CAN128__", 0x0100, 0x0, 2, 0x20000) -AVR_MCU ("at90usb1286", ARCH_AVR51, AVR_ISA_NONE, "__AVR_AT90USB1286__", 0x0100, 0x0, 2, 0x20000) -AVR_MCU ("at90usb1287", ARCH_AVR51, AVR_ISA_NONE, "__AVR_AT90USB1287__", 0x0100, 0x0, 2, 0x20000) +AVR_MCU ("avr51", ARCH_AVR51, AVR_ISA_NONE, NULL, 0x0100, 0x0, 0x20000) +AVR_MCU ("atmega128", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega128__", 0x0100, 0x0, 0x20000) +AVR_MCU ("atmega128a", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega128A__", 0x0100, 0x0, 0x20000) +AVR_MCU ("atmega1280", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega1280__", 0x0200, 0x0, 0x20000) +AVR_MCU ("atmega1281", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega1281__", 0x0200, 0x0, 0x20000) +AVR_MCU ("atmega1284", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega1284__", 0x0100, 0x0, 0x20000) +AVR_MCU ("atmega1284p", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega1284P__", 0x0100, 0x0, 0x20000) +AVR_MCU ("atmega128rfa1", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega128RFA1__", 0x0200, 0x0, 0x20000) +AVR_MCU ("atmega128rfr2", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega128RFR2__", 0x0200, 0x0, 0x20000) +AVR_MCU ("atmega1284rfr2", ARCH_AVR51, AVR_ISA_NONE, "__AVR_ATmega1284RFR2__", 0x0200, 0x0, 0x20000) +AVR_MCU ("at90can128", ARCH_AVR51, AVR_ISA_NONE, "__AVR_AT90CAN128__", 0x0100, 0x0, 0x20000) +AVR_MCU ("at90usb1286", ARCH_AVR51, AVR_ISA_NONE, "__AVR_AT90USB1286__", 0x0100, 0x0, 0x20000) +AVR_MCU ("at90usb1287", ARCH_AVR51, AVR_ISA_NONE, "__AVR_AT90USB1287__", 0x0100, 0x0, 0x20000) /* 3-Byte PC. */ -AVR_MCU ("avr6", ARCH_AVR6, AVR_ISA_NONE, NULL, 0x0200, 0x0, 4, 0x40000) -AVR_MCU ("atmega2560", ARCH_AVR6, AVR_ISA_NONE, "__AVR_ATmega2560__", 0x0200, 0x0, 4, 0x40000) -AVR_MCU ("atmega2561", ARCH_AVR6, AVR_ISA_NONE, "__AVR_ATmega2561__", 0x0200, 0x0, 4, 0x40000) -AVR_MCU ("atmega256rfr2", ARCH_AVR6, AVR_ISA_NONE, "__AVR_ATmega256RFR2__", 0x0200, 0x0, 4, 0x40000) -AVR_MCU ("atmega2564rfr2", ARCH_AVR6, AVR_ISA_NONE, "__AVR_ATmega2564RFR2__", 0x0200, 0x0, 4, 0x40000) +AVR_MCU ("avr6", ARCH_AVR6, AVR_ISA_NONE, NULL, 0x0200, 0x0, 0x40000) +AVR_MCU ("atmega2560", ARCH_AVR6, AVR_ISA_NONE, "__AVR_ATmega2560__", 0x0200, 0x0, 0x40000) +AVR_MCU ("atmega2561", ARCH_AVR6, AVR_ISA_NONE, "__AVR_ATmega2561__", 0x0200, 0x0, 0x40000) +AVR_MCU ("atmega256rfr2", ARCH_AVR6, AVR_ISA_NONE, "__AVR_ATmega256RFR2__", 0x0200, 0x0, 0x40000) +AVR_MCU ("atmega2564rfr2", ARCH_AVR6, AVR_ISA_NONE, "__AVR_ATmega2564RFR2__", 0x0200, 0x0, 0x40000) /* Xmega, 16K <= Flash < 64K, RAM <= 64K */ -AVR_MCU ("avrxmega2", ARCH_AVRXMEGA2, AVR_ISA_NONE, NULL, 0x2000, 0x0, 1, 0x9000) -AVR_MCU ("atxmega8e5", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega8E5__", 0x2000, 0x0, 1, 0x2800) -AVR_MCU ("atxmega16a4", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega16A4__", 0x2000, 0x0, 1, 0x5000) -AVR_MCU ("atxmega16d4", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega16D4__", 0x2000, 0x0, 1, 0x5000) -AVR_MCU ("atxmega16e5", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega16E5__", 0x2000, 0x0, 1, 0x5000) -AVR_MCU ("atxmega32a4", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega32A4__", 0x2000, 0x0, 1, 0x9000) -AVR_MCU ("atxmega32c3", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega32C3__", 0x2000, 0x0, 1, 0x9000) -AVR_MCU ("atxmega32d3", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega32D3__", 0x2000, 0x0, 1, 0x9000) -AVR_MCU ("atxmega32d4", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega32D4__", 0x2000, 0x0, 1, 0x9000) -AVR_MCU ("atxmega16a4u", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega16A4U__", 0x2000, 0x0, 1, 0x5000) -AVR_MCU ("atxmega16c4", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega16C4__", 0x2000, 0x0, 1, 0x5000) -AVR_MCU ("atxmega32a4u", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega32A4U__", 0x2000, 0x0, 1, 0x9000) -AVR_MCU ("atxmega32c4", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega32C4__", 0x2000, 0x0, 1, 0x9000) -AVR_MCU ("atxmega32e5", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega32E5__", 0x2000, 0x0, 1, 0x9000) +AVR_MCU ("avrxmega2", ARCH_AVRXMEGA2, AVR_ISA_NONE, NULL, 0x2000, 0x0, 0x9000) +AVR_MCU ("atxmega8e5", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega8E5__", 0x2000, 0x0, 0x2800) +AVR_MCU ("atxmega16a4", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega16A4__", 0x2000, 0x0, 0x5000) +AVR_MCU ("atxmega16d4", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega16D4__", 0x2000, 0x0, 0x5000) +AVR_MCU ("atxmega16e5", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega16E5__", 0x2000, 0x0, 0x5000) +AVR_MCU ("atxmega32a4", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega32A4__", 0x2000, 0x0, 0x9000) +AVR_MCU ("atxmega32c3", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega32C3__", 0x2000, 0x0, 0x9000) +AVR_MCU ("atxmega32d3", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega32D3__", 0x2000, 0x0, 0x9000) +AVR_MCU ("atxmega32d4", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega32D4__", 0x2000, 0x0, 0x9000) +AVR_MCU ("atxmega16a4u", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega16A4U__", 0x2000, 0x0, 0x5000) +AVR_MCU ("atxmega16c4", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega16C4__", 0x2000, 0x0, 0x5000) +AVR_MCU ("atxmega32a4u", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega32A4U__", 0x2000, 0x0, 0x9000) +AVR_MCU ("atxmega32c4", ARCH_AVRXMEGA2, AVR_ISA_RMW, "__AVR_ATxmega32C4__", 0x2000, 0x0, 0x9000) +AVR_MCU ("atxmega32e5", ARCH_AVRXMEGA2, AVR_ISA_NONE, "__AVR_ATxmega32E5__", 0x2000, 0x0, 0x9000) /* Xmega, 64K < Flash <= 128K, RAM <= 64K */ -AVR_MCU ("avrxmega4", ARCH_AVRXMEGA4, AVR_ISA_NONE, NULL, 0x2000, 0x0, 2, 0x11000) -AVR_MCU ("atxmega64a3", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_ATxmega64A3__", 0x2000, 0x0, 2, 0x11000) -AVR_MCU ("atxmega64d3", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_ATxmega64D3__", 0x2000, 0x0, 2, 0x11000) -AVR_MCU ("atxmega64a3u", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64A3U__", 0x2000, 0x0, 2, 0x11000) -AVR_MCU ("atxmega64a4u", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64A4U__", 0x2000, 0x0, 2, 0x11000) -AVR_MCU ("atxmega64b1", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64B1__", 0x2000, 0x0, 2, 0x11000) -AVR_MCU ("atxmega64b3", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64B3__", 0x2000, 0x0, 2, 0x11000) -AVR_MCU ("atxmega64c3", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64C3__", 0x2000, 0x0, 2, 0x11000) -AVR_MCU ("atxmega64d4", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_ATxmega64D4__", 0x2000, 0x0, 2, 0x11000) +AVR_MCU ("avrxmega4", ARCH_AVRXMEGA4, AVR_ISA_NONE, NULL, 0x2000, 0x0, 0x11000) +AVR_MCU ("atxmega64a3", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_ATxmega64A3__", 0x2000, 0x0, 0x11000) +AVR_MCU ("atxmega64d3", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_ATxmega64D3__", 0x2000, 0x0, 0x11000) +AVR_MCU ("atxmega64a3u", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64A3U__", 0x2000, 0x0, 0x11000) +AVR_MCU ("atxmega64a4u", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64A4U__", 0x2000, 0x0, 0x11000) +AVR_MCU ("atxmega64b1", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64B1__", 0x2000, 0x0, 0x11000) +AVR_MCU ("atxmega64b3", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64B3__", 0x2000, 0x0, 0x11000) +AVR_MCU ("atxmega64c3", ARCH_AVRXMEGA4, AVR_ISA_RMW, "__AVR_ATxmega64C3__", 0x2000, 0x0, 0x11000) +AVR_MCU ("atxmega64d4", ARCH_AVRXMEGA4, AVR_ISA_NONE, "__AVR_ATxmega64D4__", 0x2000, 0x0, 0x11000) /* Xmega, 64K < Flash <= 128K, RAM > 64K */ -AVR_MCU ("avrxmega5", ARCH_AVRXMEGA5, AVR_ISA_NONE, NULL, 0x2000, 0x0, 2, 0x11000) -AVR_MCU ("atxmega64a1", ARCH_AVRXMEGA5, AVR_ISA_NONE, "__AVR_ATxmega64A1__", 0x2000, 0x0, 2, 0x11000) -AVR_MCU ("atxmega64a1u", ARCH_AVRXMEGA5, AVR_ISA_RMW, "__AVR_ATxmega64A1U__", 0x2000, 0x0, 2, 0x11000) +AVR_MCU ("avrxmega5", ARCH_AVRXMEGA5, AVR_ISA_NONE, NULL, 0x2000, 0x0, 0x11000) +AVR_MCU ("atxmega64a1", ARCH_AVRXMEGA5, AVR_ISA_NONE, "__AVR_ATxmega64A1__", 0x2000, 0x0, 0x11000) +AVR_MCU ("atxmega64a1u", ARCH_AVRXMEGA5, AVR_ISA_RMW, "__AVR_ATxmega64A1U__", 0x2000, 0x0, 0x11000) /* Xmega, 128K < Flash, RAM <= 64K */ -AVR_MCU ("avrxmega6", ARCH_AVRXMEGA6, AVR_ISA_NONE, NULL, 0x2000, 0x0, 6, 0x60000) -AVR_MCU ("atxmega128a3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega128A3__", 0x2000, 0x0, 3, 0x22000) -AVR_MCU ("atxmega128d3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega128D3__", 0x2000, 0x0, 3, 0x22000) -AVR_MCU ("atxmega192a3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega192A3__", 0x2000, 0x0, 4, 0x32000) -AVR_MCU ("atxmega192d3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega192D3__", 0x2000, 0x0, 4, 0x32000) -AVR_MCU ("atxmega256a3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega256A3__", 0x2000, 0x0, 5, 0x42000) -AVR_MCU ("atxmega256a3b", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega256A3B__", 0x2000, 0x0, 5, 0x42000) -AVR_MCU ("atxmega256a3bu", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega256A3BU__", 0x2000, 0x0, 5, 0x42000) -AVR_MCU ("atxmega256d3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega256D3__", 0x2000, 0x0, 5, 0x42000) -AVR_MCU ("atxmega128a3u", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega128A3U__", 0x2000, 0x0, 3, 0x22000) -AVR_MCU ("atxmega128b1", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega128B1__", 0x2000, 0x0, 3, 0x22000) -AVR_MCU ("atxmega128b3", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega128B3__", 0x2000, 0x0, 3, 0x22000) -AVR_MCU ("atxmega128c3", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega128C3__", 0x2000, 0x0, 3, 0x22000) -AVR_MCU ("atxmega128d4", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega128D4__", 0x2000, 0x0, 3, 0x22000) -AVR_MCU ("atxmega192a3u", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega192A3U__", 0x2000, 0x0, 4, 0x32000) -AVR_MCU ("atxmega192c3", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega192C3__", 0x2000, 0x0, 4, 0x32000) -AVR_MCU ("atxmega256a3u", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega256A3U__", 0x2000, 0x0, 5, 0x42000) -AVR_MCU ("atxmega256c3", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega256C3__", 0x2000, 0x0, 5, 0x42000) -AVR_MCU ("atxmega384c3", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega384C3__", 0x2000, 0x0, 7, 0x62000) -AVR_MCU ("atxmega384d3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega384D3__", 0x2000, 0x0, 7, 0x62000) +AVR_MCU ("avrxmega6", ARCH_AVRXMEGA6, AVR_ISA_NONE, NULL, 0x2000, 0x0, 0x60000) +AVR_MCU ("atxmega128a3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega128A3__", 0x2000, 0x0, 0x22000) +AVR_MCU ("atxmega128d3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega128D3__", 0x2000, 0x0, 0x22000) +AVR_MCU ("atxmega192a3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega192A3__", 0x2000, 0x0, 0x32000) +AVR_MCU ("atxmega192d3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega192D3__", 0x2000, 0x0, 0x32000) +AVR_MCU ("atxmega256a3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega256A3__", 0x2000, 0x0, 0x42000) +AVR_MCU ("atxmega256a3b", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega256A3B__", 0x2000, 0x0, 0x42000) +AVR_MCU ("atxmega256a3bu", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega256A3BU__", 0x2000, 0x0, 0x42000) +AVR_MCU ("atxmega256d3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega256D3__", 0x2000, 0x0, 0x42000) +AVR_MCU ("atxmega128a3u", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega128A3U__", 0x2000, 0x0, 0x22000) +AVR_MCU ("atxmega128b1", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega128B1__", 0x2000, 0x0, 0x22000) +AVR_MCU ("atxmega128b3", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega128B3__", 0x2000, 0x0, 0x22000) +AVR_MCU ("atxmega128c3", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega128C3__", 0x2000, 0x0, 0x22000) +AVR_MCU ("atxmega128d4", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega128D4__", 0x2000, 0x0, 0x22000) +AVR_MCU ("atxmega192a3u", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega192A3U__", 0x2000, 0x0, 0x32000) +AVR_MCU ("atxmega192c3", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega192C3__", 0x2000, 0x0, 0x32000) +AVR_MCU ("atxmega256a3u", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega256A3U__", 0x2000, 0x0, 0x42000) +AVR_MCU ("atxmega256c3", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega256C3__", 0x2000, 0x0, 0x42000) +AVR_MCU ("atxmega384c3", ARCH_AVRXMEGA6, AVR_ISA_RMW, "__AVR_ATxmega384C3__", 0x2000, 0x0, 0x62000) +AVR_MCU ("atxmega384d3", ARCH_AVRXMEGA6, AVR_ISA_NONE, "__AVR_ATxmega384D3__", 0x2000, 0x0, 0x62000) /* Xmega, 128K < Flash, RAM > 64K RAM. */ -AVR_MCU ("avrxmega7", ARCH_AVRXMEGA7, AVR_ISA_NONE, NULL, 0x2000, 0x0, 3, 0x22000) -AVR_MCU ("atxmega128a1", ARCH_AVRXMEGA7, AVR_ISA_NONE, "__AVR_ATxmega128A1__", 0x2000, 0x0, 3, 0x22000) -AVR_MCU ("atxmega128a1u", ARCH_AVRXMEGA7, AVR_ISA_RMW, "__AVR_ATxmega128A1U__", 0x2000, 0x0, 3, 0x22000) -AVR_MCU ("atxmega128a4u", ARCH_AVRXMEGA7, AVR_ISA_RMW, "__AVR_ATxmega128A4U__", 0x2000, 0x0, 3, 0x22000) +AVR_MCU ("avrxmega7", ARCH_AVRXMEGA7, AVR_ISA_NONE, NULL, 0x2000, 0x0, 0x22000) +AVR_MCU ("atxmega128a1", ARCH_AVRXMEGA7, AVR_ISA_NONE, "__AVR_ATxmega128A1__", 0x2000, 0x0, 0x22000) +AVR_MCU ("atxmega128a1u", ARCH_AVRXMEGA7, AVR_ISA_RMW, "__AVR_ATxmega128A1U__", 0x2000, 0x0, 0x22000) +AVR_MCU ("atxmega128a4u", ARCH_AVRXMEGA7, AVR_ISA_RMW, "__AVR_ATxmega128A4U__", 0x2000, 0x0, 0x22000) /* Tiny family */ -AVR_MCU ("avrtiny", ARCH_AVRTINY, AVR_ISA_NONE, NULL, 0x0040, 0x0, 1, 0x400) -AVR_MCU ("attiny4", ARCH_AVRTINY, AVR_ISA_LDS, "__AVR_ATtiny4__", 0x0040, 0x0, 1, 0x200) -AVR_MCU ("attiny5", ARCH_AVRTINY, AVR_ISA_LDS, "__AVR_ATtiny5__", 0x0040, 0x0, 1, 0x200) -AVR_MCU ("attiny9", ARCH_AVRTINY, AVR_ISA_LDS, "__AVR_ATtiny9__", 0x0040, 0x0, 1, 0x400) -AVR_MCU ("attiny10", ARCH_AVRTINY, AVR_ISA_LDS, "__AVR_ATtiny10__", 0x0040, 0x0, 1, 0x400) -AVR_MCU ("attiny20", ARCH_AVRTINY, AVR_ISA_LDS, "__AVR_ATtiny20__", 0x0040, 0x0, 1, 0x800) -AVR_MCU ("attiny40", ARCH_AVRTINY, AVR_ISA_NONE, "__AVR_ATtiny40__", 0x0040, 0x0, 1, 0x1000) +AVR_MCU ("avrtiny", ARCH_AVRTINY, AVR_ISA_NONE, NULL, 0x0040, 0x0, 0x400) +AVR_MCU ("attiny4", ARCH_AVRTINY, AVR_ISA_LDS, "__AVR_ATtiny4__", 0x0040, 0x0, 0x200) +AVR_MCU ("attiny5", ARCH_AVRTINY, AVR_ISA_LDS, "__AVR_ATtiny5__", 0x0040, 0x0, 0x200) +AVR_MCU ("attiny9", ARCH_AVRTINY, AVR_ISA_LDS, "__AVR_ATtiny9__", 0x0040, 0x0, 0x400) +AVR_MCU ("attiny10", ARCH_AVRTINY, AVR_ISA_LDS, "__AVR_ATtiny10__", 0x0040, 0x0, 0x400) +AVR_MCU ("attiny20", ARCH_AVRTINY, AVR_ISA_LDS, "__AVR_ATtiny20__", 0x0040, 0x0, 0x800) +AVR_MCU ("attiny40", ARCH_AVRTINY, AVR_ISA_NONE, "__AVR_ATtiny40__", 0x0040, 0x0, 0x1000) /* Assembler only. */ -AVR_MCU ("avr1", ARCH_AVR1, AVR_ISA_NONE, NULL, 0x0060, 0x0, 1, 0x400) -AVR_MCU ("at90s1200", ARCH_AVR1, AVR_ISA_NONE, "__AVR_AT90S1200__", 0x0060, 0x0, 1, 0x400) -AVR_MCU ("attiny11", ARCH_AVR1, AVR_ISA_NONE, "__AVR_ATtiny11__", 0x0060, 0x0, 1, 0x400) -AVR_MCU ("attiny12", ARCH_AVR1, AVR_ISA_NONE, "__AVR_ATtiny12__", 0x0060, 0x0, 1, 0x400) -AVR_MCU ("attiny15", ARCH_AVR1, AVR_ISA_NONE, "__AVR_ATtiny15__", 0x0060, 0x0, 1, 0x400) -AVR_MCU ("attiny28", ARCH_AVR1, AVR_ISA_NONE, "__AVR_ATtiny28__", 0x0060, 0x0, 1, 0x800) +AVR_MCU ("avr1", ARCH_AVR1, AVR_ISA_NONE, NULL, 0x0060, 0x0, 0x400) +AVR_MCU ("at90s1200", ARCH_AVR1, AVR_ISA_NONE, "__AVR_AT90S1200__", 0x0060, 0x0, 0x400) +AVR_MCU ("attiny11", ARCH_AVR1, AVR_ISA_NONE, "__AVR_ATtiny11__", 0x0060, 0x0, 0x400) +AVR_MCU ("attiny12", ARCH_AVR1, AVR_ISA_NONE, "__AVR_ATtiny12__", 0x0060, 0x0, 0x400) +AVR_MCU ("attiny15", ARCH_AVR1, AVR_ISA_NONE, "__AVR_ATtiny15__", 0x0060, 0x0, 0x400) +AVR_MCU ("attiny28", ARCH_AVR1, AVR_ISA_NONE, "__AVR_ATtiny28__", 0x0060, 0x0, 0x800) diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c index b7fd8798..48f4788 100644 --- a/gcc/config/avr/avr.c +++ b/gcc/config/avr/avr.c @@ -707,7 +707,7 @@ avr_set_core_architecture (void) { avr_arch = &avr_arch_types[mcu->arch_id]; if (avr_n_flash < 0) - avr_n_flash = mcu->n_flash; + avr_n_flash = 1 + (mcu->flash_size - 1) / 0x10000; return true; } diff --git a/gcc/config/avr/gen-avr-mmcu-specs.c b/gcc/config/avr/gen-avr-mmcu-specs.c index ee75b1e..48a749b 100644 --- a/gcc/config/avr/gen-avr-mmcu-specs.c +++ b/gcc/config/avr/gen-avr-mmcu-specs.c @@ -179,8 +179,10 @@ print_mcu (const avr_mcu_t *mcu) // avr-gcc specific specs for the compilation / the compiler proper. + int n_flash = 1 + (mcu->flash_size - 1) / 0x10000; + fprintf (f, "*cc1_n_flash:\n" - "\t%%{!mn-flash=*:-mn-flash=%d}\n\n", mcu->n_flash); + "\t%%{!mn-flash=*:-mn-flash=%d}\n\n", n_flash); fprintf (f, "*cc1_rmw:\n%s\n\n", rmw ? "\t%{!mno-rmw: -mrmw}" -- cgit v1.1 From efa68ffca9f9032e795ae84c5642038bd613c253 Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Fri, 2 Dec 2016 11:52:58 +0000 Subject: S/390: Fix RTL sharing when generating reg note. gcc/ChangeLog: 2016-12-02 Andreas Krebbel * config/s390/s390.c (s390_save_gprs_to_fprs): Fix RTL sharing problem. From-SVN: r243173 --- gcc/ChangeLog | 5 +++++ gcc/config/s390/s390.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a0cefa7..03387cf 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Andreas Krebbel + + * config/s390/s390.c (s390_save_gprs_to_fprs): Fix RTL sharing + problem. + 2016-12-02 Georg-Johann Lay * config/avr/avr-arch.h (avr_mcu_t) [n_flash]: Remove field. diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 767666e..030e10d 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -10666,7 +10666,7 @@ s390_save_gprs_to_fprs (void) /* This prevents dwarf2cfi from interpreting the set. Doing so it might emit def_cfa_register infos setting an FPR as new CFA. */ - add_reg_note (insn, REG_CFA_REGISTER, PATTERN (insn)); + add_reg_note (insn, REG_CFA_REGISTER, copy_rtx (PATTERN (insn))); } } } -- cgit v1.1 From cc9037a6e93f50efae6494a849c09a7ffa4b253f Mon Sep 17 00:00:00 2001 From: Aldy Hernandez Date: Fri, 2 Dec 2016 12:20:42 +0000 Subject: re PR middle-end/78328 (wrong wording for unbounded alloc case in -Walloca-larger-than note) PR middle-end/78328 * gimple-ssa-warn-alloca.c (alloca_call_type): Handle VR_ANTI_RANGE. From-SVN: r243174 --- gcc/ChangeLog | 6 ++++++ gcc/gimple-ssa-warn-alloca.c | 2 ++ gcc/testsuite/gcc.dg/Walloca-12.c | 11 +++++++++++ 3 files changed, 19 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/Walloca-12.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 03387cf..afac973 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2016-12-02 Aldy Hernandez + + PR middle-end/78328 + * gimple-ssa-warn-alloca.c (alloca_call_type): Handle + VR_ANTI_RANGE. + 2016-12-02 Andreas Krebbel * config/s390/s390.c (s390_save_gprs_to_fprs): Fix RTL sharing diff --git a/gcc/gimple-ssa-warn-alloca.c b/gcc/gimple-ssa-warn-alloca.c index e75f2fa..ae379f9 100644 --- a/gcc/gimple-ssa-warn-alloca.c +++ b/gcc/gimple-ssa-warn-alloca.c @@ -339,6 +339,8 @@ alloca_call_type (gimple *stmt, bool is_vla, tree *invalid_casted_type) { // Fall through. } + else if (range_type == VR_ANTI_RANGE) + return alloca_type_and_limit (ALLOCA_UNBOUNDED); else if (range_type != VR_VARYING) return alloca_type_and_limit (ALLOCA_BOUND_MAYBE_LARGE, max); diff --git a/gcc/testsuite/gcc.dg/Walloca-12.c b/gcc/testsuite/gcc.dg/Walloca-12.c new file mode 100644 index 0000000..5d71cda --- /dev/null +++ b/gcc/testsuite/gcc.dg/Walloca-12.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-Walloca-larger-than=128 -O2" } */ + +void f (void*); + +void g (unsigned int n) +{ + if (n == 7) + n = 11; + f (__builtin_alloca (n)); /* { dg-warning "unbounded use of 'alloca'" } */ +} -- cgit v1.1 From d003d97f1d17b7ca89668fdf71d8a7f9c839cc2b Mon Sep 17 00:00:00 2001 From: Dominik Vogt Date: Fri, 2 Dec 2016 12:32:16 +0000 Subject: S/390: Fix setmem-long test. Adding a " in the scan-assembler pattern is necessary because of a recent change in print-rtl.c. gcc/testsuite/ChangeLog: 2016-12-02 Dominik Vogt * gcc.target/s390/md/setmem_long-1.c: Fix test. From-SVN: r243176 --- gcc/testsuite/ChangeLog | 4 ++++ gcc/testsuite/gcc.target/s390/md/setmem_long-1.c | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 490d081..c1416d0 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2016-12-02 Dominik Vogt + + * gcc.target/s390/md/setmem_long-1.c: Fix test. + 2016-12-02 Jakub Jelinek PR rtl-optimization/78547 diff --git a/gcc/testsuite/gcc.target/s390/md/setmem_long-1.c b/gcc/testsuite/gcc.target/s390/md/setmem_long-1.c index 933a698..bd0c594 100644 --- a/gcc/testsuite/gcc.target/s390/md/setmem_long-1.c +++ b/gcc/testsuite/gcc.target/s390/md/setmem_long-1.c @@ -16,8 +16,8 @@ void test2(char *p, int c, int len) } /* Check that the right patterns are used. */ -/* { dg-final { scan-assembler-times {c:9 .*{[*]setmem_long_?3?1?z?}} 1 } } */ -/* { dg-final { scan-assembler-times {c:15 .*{[*]setmem_long_and_?3?1?z?}} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {c"?:9 .*{[*]setmem_long_?3?1?z?}} 1 } } */ +/* { dg-final { scan-assembler-times {c"?:15 .*{[*]setmem_long_and_?3?1?z?}} 1 { xfail *-*-* } } } */ #define LEN 500 char buf[LEN + 2]; -- cgit v1.1 From 03fd1ef632f41caaaa38a80712ed7eda3f97fdff Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Fri, 2 Dec 2016 13:14:01 +0000 Subject: diagnostic.c (diagnostic_report_diagnostic): Remove extraneous braces. * diagnostic.c (diagnostic_report_diagnostic): Remove extraneous braces. From-SVN: r243177 --- gcc/ChangeLog | 5 +++++ gcc/diagnostic.c | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index afac973..7da0aca 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Nathan Sidwell + + * diagnostic.c (diagnostic_report_diagnostic): Remove extraneous + braces. + 2016-12-02 Aldy Hernandez PR middle-end/78328 diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c index 2304e14..4278a10 100644 --- a/gcc/diagnostic.c +++ b/gcc/diagnostic.c @@ -834,9 +834,7 @@ diagnostic_report_diagnostic (diagnostic_context *context, -Wno-error=*. */ if (context->warning_as_error_requested && diagnostic->kind == DK_WARNING) - { - diagnostic->kind = DK_ERROR; - } + diagnostic->kind = DK_ERROR; if (diagnostic->option_index && diagnostic->option_index != permissive_error_option (context)) -- cgit v1.1 From 310a7f96996f46f1565376f092e08daa1d44d1e7 Mon Sep 17 00:00:00 2001 From: Jason Merrill Date: Fri, 2 Dec 2016 08:58:32 -0500 Subject: call.c (add_function_candidate): Also exclude inherited ctors that take a type reference-related to the derived... * call.c (add_function_candidate): Also exclude inherited ctors that take a type reference-related to the derived class. From-SVN: r243178 --- gcc/cp/ChangeLog | 3 +++ gcc/cp/call.c | 23 +++++------------------ 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index b407d17..4977ff2 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -1,5 +1,8 @@ 2016-12-01 Jason Merrill + * call.c (add_function_candidate): Also exclude inherited ctors + that take a type reference-related to the derived class. + * call.c (add_function_candidate): Exclude inherited copy/move ctors. diff --git a/gcc/cp/call.c b/gcc/cp/call.c index 561cc83..b7aa97c 100644 --- a/gcc/cp/call.c +++ b/gcc/cp/call.c @@ -2042,19 +2042,18 @@ add_function_candidate (struct z_candidate **candidates, reason = arity_rejection (first_arg, i + remaining, len); } - /* A constructor that is a direct member of a class C and has a first + /* An inherited constructor (12.6.3 [class.inhctor.init]) that has a first parameter of type "reference to cv C" (including such a constructor instantiated from a template) is excluded from the set of candidate - functions when used to construct an object of type derived from C (12.6.3 - [class.inhctor.init]) with an argument list containing a single - argument. */ + functions when used to construct an object of type D with an argument list + containing a single argument if C is reference-related to D. */ if (viable && len == 1 && parmlist && DECL_CONSTRUCTOR_P (fn) && flag_new_inheriting_ctors && DECL_INHERITED_CTOR (fn)) { tree ptype = non_reference (TREE_VALUE (parmlist)); - tree ctype = DECL_INHERITED_CTOR_BASE (fn); - if (same_type_ignoring_top_level_qualifiers_p (ptype, ctype)) + tree dtype = DECL_CONTEXT (fn); + if (reference_related_p (ptype, dtype)) { viable = false; reason = inherited_ctor_rejection (); @@ -2161,18 +2160,6 @@ add_function_candidate (struct z_candidate **candidates, } } - /* Don't consider inherited constructors for initialization from an - expression of the same or derived type. */ - /* FIXME extend to operator=. */ - if (i == 0 && len == 1 - && DECL_INHERITED_CTOR (fn) - && reference_related_p (ctype, argtype)) - { - viable = 0; - reason = inherited_ctor_rejection (); - goto out; - } - /* Core issue 899: When [copy-]initializing a temporary to be bound to the first parameter of a copy constructor (12.8) called with a single argument in the context of direct-initialization, -- cgit v1.1 From d313d52cd5fff9374f01967fb7964b6633df219c Mon Sep 17 00:00:00 2001 From: Sebastian Huber Date: Fri, 2 Dec 2016 14:10:33 +0000 Subject: [RTEMS] Fix libgomp for nthreads == 1 libgomp/ * config/rtems/pool.h (gomp_get_thread_pool): Return proper thread pool in case nthreads == 1. From-SVN: r243179 --- libgomp/ChangeLog | 5 +++++ libgomp/config/rtems/pool.h | 26 +++++++++++--------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index 9e2a300..f072ce4 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Sebastian Huber + + * config/rtems/pool.h (gomp_get_thread_pool): Return proper + thread pool in case nthreads == 1. + 2016-11-30 Alexander Monakov * config/nvptx/env.c: Delete. diff --git a/libgomp/config/rtems/pool.h b/libgomp/config/rtems/pool.h index 7520c07..e69eca4 100644 --- a/libgomp/config/rtems/pool.h +++ b/libgomp/config/rtems/pool.h @@ -87,28 +87,24 @@ static inline struct gomp_thread_pool * gomp_get_thread_pool (struct gomp_thread *thr, unsigned nthreads) { struct gomp_thread_pool *pool; + struct gomp_thread_pool_reservoir *res; if (__builtin_expect (thr->thread_pool == NULL, 0)) pthread_setspecific (gomp_thread_destructor, thr); - if (nthreads != 1) + res = gomp_get_thread_pool_reservoir (); + if (res != NULL) { - struct gomp_thread_pool_reservoir *res = - gomp_get_thread_pool_reservoir (); - if (res != NULL) - { - gomp_sem_wait (&res->available); - gomp_mutex_lock (&res->lock); - pool = res->pools[--res->index]; - gomp_mutex_unlock (&res->lock); - pool->threads_busy = nthreads; - thr->thread_pool = pool; - } - else - pool = gomp_get_own_thread_pool (thr, nthreads); + gomp_sem_wait (&res->available); + gomp_mutex_lock (&res->lock); + pool = res->pools[--res->index]; + gomp_mutex_unlock (&res->lock); + pool->threads_busy = nthreads; + thr->thread_pool = pool; } else - pool = NULL; + pool = gomp_get_own_thread_pool (thr, nthreads); + return pool; } -- cgit v1.1 From 714445ae04640bc096693623fb805bcf14148663 Mon Sep 17 00:00:00 2001 From: Bin Cheng Date: Fri, 2 Dec 2016 14:13:11 +0000 Subject: match.pd: Add new pattern: (cond (cmp (convert? * match.pd: Add new pattern: (cond (cmp (convert? x) c1) (op x c2) c3) -> (op (minmax x c1) c2). gcc/testsuite * gcc.dg/fold-bopcond-1.c: New test. * gcc.dg/fold-bopcond-2.c: New test. From-SVN: r243180 --- gcc/ChangeLog | 5 ++ gcc/match.pd | 100 ++++++++++++++++++++++++++++++++++ gcc/testsuite/ChangeLog | 5 ++ gcc/testsuite/gcc.dg/fold-bopcond-1.c | 48 ++++++++++++++++ gcc/testsuite/gcc.dg/fold-bopcond-2.c | 48 ++++++++++++++++ 5 files changed, 206 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/fold-bopcond-1.c create mode 100644 gcc/testsuite/gcc.dg/fold-bopcond-2.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7da0aca..85b4bdb 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Bin Cheng + + * match.pd: Add new pattern: + (cond (cmp (convert? x) c1) (op x c2) c3) -> (op (minmax x c1) c2). + 2016-12-02 Nathan Sidwell * diagnostic.c (diagnostic_report_diagnostic): Remove extraneous diff --git a/gcc/match.pd b/gcc/match.pd index bc8a5e7..dbb9103 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -2038,6 +2038,106 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (convert (cond (eq @1 (convert @3)) (convert:from_type @3) (convert:from_type @2))))))))) +/* (cond (cmp (convert? x) c1) (op x c2) c3) -> (op (minmax x c1) c2) if: + + 1) OP is PLUS or MINUS. + 2) CMP is LT, LE, GT or GE. + 3) C3 == (C1 op C2), and computation doesn't have undefined behavior. + + This pattern also handles special cases like: + + A) Operand x is a unsigned to signed type conversion and c1 is + integer zero. In this case, + (signed type)x < 0 <=> x > MAX_VAL(signed type) + (signed type)x >= 0 <=> x <= MAX_VAL(signed type) + B) Const c1 may not equal to (C3 op' C2). In this case we also + check equality for (c1+1) and (c1-1) by adjusting comparison + code. + + TODO: Though signed type is handled by this pattern, it cannot be + simplified at the moment because C standard requires additional + type promotion. In order to match&simplify it here, the IR needs + to be cleaned up by other optimizers, i.e, VRP. */ +(for op (plus minus) + (for cmp (lt le gt ge) + (simplify + (cond (cmp (convert? @X) INTEGER_CST@1) (op @X INTEGER_CST@2) INTEGER_CST@3) + (with { tree from_type = TREE_TYPE (@X), to_type = TREE_TYPE (@1); } + (if (types_match (from_type, to_type) + /* Check if it is special case A). */ + || (TYPE_UNSIGNED (from_type) + && !TYPE_UNSIGNED (to_type) + && TYPE_PRECISION (from_type) == TYPE_PRECISION (to_type) + && integer_zerop (@1) + && (cmp == LT_EXPR || cmp == GE_EXPR))) + (with + { + bool overflow = false; + enum tree_code code, cmp_code = cmp; + wide_int real_c1, c1 = @1, c2 = @2, c3 = @3; + signop sgn = TYPE_SIGN (from_type); + + /* Handle special case A), given x of unsigned type: + ((signed type)x < 0) <=> (x > MAX_VAL(signed type)) + ((signed type)x >= 0) <=> (x <= MAX_VAL(signed type)) */ + if (!types_match (from_type, to_type)) + { + if (cmp_code == LT_EXPR) + cmp_code = GT_EXPR; + if (cmp_code == GE_EXPR) + cmp_code = LE_EXPR; + c1 = wi::max_value (to_type); + } + /* To simplify this pattern, we require c3 = (c1 op c2). Here we + compute (c3 op' c2) and check if it equals to c1 with op' being + the inverted operator of op. Make sure overflow doesn't happen + if it is undefined. */ + if (op == PLUS_EXPR) + real_c1 = wi::sub (c3, c2, sgn, &overflow); + else + real_c1 = wi::add (c3, c2, sgn, &overflow); + + code = cmp_code; + if (!overflow || !TYPE_OVERFLOW_UNDEFINED (from_type)) + { + /* Check if c1 equals to real_c1. Boundary condition is handled + by adjusting comparison operation if necessary. */ + if (!wi::cmp (wi::sub (real_c1, 1, sgn, &overflow), c1, sgn) + && !overflow) + { + /* X <= Y - 1 equals to X < Y. */ + if (cmp_code == LE_EXPR) + code = LT_EXPR; + /* X > Y - 1 equals to X >= Y. */ + if (cmp_code == GT_EXPR) + code = GE_EXPR; + } + if (!wi::cmp (wi::add (real_c1, 1, sgn, &overflow), c1, sgn) + && !overflow) + { + /* X < Y + 1 equals to X <= Y. */ + if (cmp_code == LT_EXPR) + code = LE_EXPR; + /* X >= Y + 1 equals to X > Y. */ + if (cmp_code == GE_EXPR) + code = GT_EXPR; + } + if (code != cmp_code || !wi::cmp (real_c1, c1, sgn)) + { + if (cmp_code == LT_EXPR || cmp_code == LE_EXPR) + code = MIN_EXPR; + if (cmp_code == GT_EXPR || cmp_code == GE_EXPR) + code = MAX_EXPR; + } + } + } + (if (code == MAX_EXPR) + (op (max @X { wide_int_to_tree (from_type, real_c1); }) + { wide_int_to_tree (from_type, c2); }) + (if (code == MIN_EXPR) + (op (min @X { wide_int_to_tree (from_type, real_c1); }) + { wide_int_to_tree (from_type, c2); }))))))))) + (for cnd (cond vec_cond) /* A ? B : (A ? X : C) -> A ? B : C. */ (simplify diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index c1416d0..143687d 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Bin Cheng + + * gcc.dg/fold-bopcond-1.c: New test. + * gcc.dg/fold-bopcond-2.c: New test. + 2016-12-02 Dominik Vogt * gcc.target/s390/md/setmem_long-1.c: Fix test. diff --git a/gcc/testsuite/gcc.dg/fold-bopcond-1.c b/gcc/testsuite/gcc.dg/fold-bopcond-1.c new file mode 100644 index 0000000..7324c16 --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-bopcond-1.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fdump-tree-ifcvt" } */ + +int foo1 (unsigned short a[], unsigned int x) +{ + unsigned int i; + for (i = 0; i < 1000; i++) + { + x = a[i]; + a[i] = (unsigned short)(x <= 32768 ? x + 32768 : 0); + } + return x; +} + +int foo2 (unsigned short a[], unsigned int x) +{ + unsigned int i; + for (i = 0; i < 1000; i++) + { + x = a[i]; + a[i] = (unsigned short)(x < 32768 ? x + 32768 : 0); + } + return x; +} + +int foo3 (unsigned short a[], unsigned int x) +{ + unsigned int i; + for (i = 0; i < 1000; i++) + { + x = a[i]; + a[i] = (unsigned short)(x < 1000 ? x - 1000 : 0); + } + return x; +} + +int foo4 (unsigned short a[], unsigned int x) +{ + unsigned int i; + for (i = 0; i < 1000; i++) + { + x = a[i]; + a[i] = (unsigned short)(x <= 2 ? x + 999 : 1001); + } + return x; +} + +/* { dg-final { scan-tree-dump-times "MIN_EXPR " 4 "ifcvt" } } */ diff --git a/gcc/testsuite/gcc.dg/fold-bopcond-2.c b/gcc/testsuite/gcc.dg/fold-bopcond-2.c new file mode 100644 index 0000000..7a47449 --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-bopcond-2.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fdump-tree-ifcvt" } */ + +int foo1 (unsigned short a[], unsigned int x) +{ + unsigned int i; + for (i = 0; i < 1000; i++) + { + x = a[i]; + a[i] = (unsigned short)(x >= 32768 ? x - 32768 : 0); + } + return x; +} + +int foo2 (unsigned short a[], unsigned int x) +{ + unsigned int i; + for (i = 0; i < 1000; i++) + { + x = a[i]; + a[i] = (unsigned short)(x > 32768 ? x - 32768 : 0); + } + return x; +} + +int foo3 (unsigned short a[], unsigned int x) +{ + unsigned int i; + for (i = 0; i < 1000; i++) + { + x = a[i]; + a[i] = (unsigned short)(x > 1000 ? x - 1000 : 0); + } + return x; +} + +int foo4 (unsigned short a[], unsigned int x) +{ + unsigned int i; + for (i = 0; i < 1000; i++) + { + x = a[i]; + a[i] = (unsigned short)(x >= 2 ? x - 32768 : 32770); + } + return x; +} + +/* { dg-final { scan-tree-dump-times "MAX_EXPR " 4 "ifcvt" } } */ -- cgit v1.1 From e7a7f4bea8356e24ca5f2c25a75dc005c7492f89 Mon Sep 17 00:00:00 2001 From: Sebastian Huber Date: Fri, 2 Dec 2016 14:13:12 +0000 Subject: [RTEMS] Use spin lock for pool management libgomp/ * libgomp/config/rtems/pool.h (gomp_thread_pool_reservoir): Use pthread_spinlock_t instead of gomp_mutex_t lock. (gomp_get_thread_pool): Likewise. (gomp_release_thread_pool): Likewise. * libgomp/config/rtems/proc.c (allocate_thread_pool_reservoir): Likewise. From-SVN: r243181 --- libgomp/ChangeLog | 9 +++++++++ libgomp/config/rtems/pool.h | 10 +++++----- libgomp/config/rtems/proc.c | 2 +- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog index f072ce4..469e896 100644 --- a/libgomp/ChangeLog +++ b/libgomp/ChangeLog @@ -1,5 +1,14 @@ 2016-12-02 Sebastian Huber + * libgomp/config/rtems/pool.h (gomp_thread_pool_reservoir): Use + pthread_spinlock_t instead of gomp_mutex_t lock. + (gomp_get_thread_pool): Likewise. + (gomp_release_thread_pool): Likewise. + * libgomp/config/rtems/proc.c (allocate_thread_pool_reservoir): + Likewise. + +2016-12-02 Sebastian Huber + * config/rtems/pool.h (gomp_get_thread_pool): Return proper thread pool in case nthreads == 1. diff --git a/libgomp/config/rtems/pool.h b/libgomp/config/rtems/pool.h index e69eca4..83fddc8 100644 --- a/libgomp/config/rtems/pool.h +++ b/libgomp/config/rtems/pool.h @@ -39,7 +39,7 @@ GOMP_RTEMS_THREAD_POOLS environment variable. */ struct gomp_thread_pool_reservoir { gomp_sem_t available; - gomp_mutex_t lock; + pthread_spinlock_t lock; size_t index; int priority; struct gomp_thread_pool *pools[]; @@ -96,9 +96,9 @@ gomp_get_thread_pool (struct gomp_thread *thr, unsigned nthreads) if (res != NULL) { gomp_sem_wait (&res->available); - gomp_mutex_lock (&res->lock); + pthread_spin_lock (&res->lock); pool = res->pools[--res->index]; - gomp_mutex_unlock (&res->lock); + pthread_spin_unlock (&res->lock); pool->threads_busy = nthreads; thr->thread_pool = pool; } @@ -115,9 +115,9 @@ gomp_release_thread_pool (struct gomp_thread_pool *pool) gomp_tls_rtems_data.thread_pool_reservoir; if (res != NULL) { - gomp_mutex_lock (&res->lock); + pthread_spin_lock (&res->lock); res->pools[res->index++] = pool; - gomp_mutex_unlock (&res->lock); + pthread_spin_unlock (&res->lock); gomp_sem_post (&res->available); } } diff --git a/libgomp/config/rtems/proc.c b/libgomp/config/rtems/proc.c index d4123d2..5e04b47 100644 --- a/libgomp/config/rtems/proc.c +++ b/libgomp/config/rtems/proc.c @@ -66,7 +66,7 @@ allocate_thread_pool_reservoir (unsigned long count, unsigned long priority, res->index = count; res->priority = priority; gomp_sem_init (&res->available, count); - gomp_mutex_init (&res->lock); + pthread_spin_init (&res->lock, PTHREAD_PROCESS_PRIVATE); for (i = 0; i < count; ++i) res->pools[i] = &pools[i]; gomp_thread_pool_reservoirs[scheduler] = res; -- cgit v1.1 From 474bbda1675d35ab6f78ffdf20ee091f8402d185 Mon Sep 17 00:00:00 2001 From: James Greenhalgh Date: Fri, 2 Dec 2016 14:29:35 +0000 Subject: [Patch 1/2 PR78561] Rename get_pool_size to get_pool_size_upper_bound gcc/ PR rtl-optimization/78561 * config/rs6000/rs6000.c (rs6000_reg_live_or_pic_offset_p) Rename get_pool_size to get_pool_size_upper_bound. (rs6000_stack_info): Likewise. (rs6000_emit_prologue): Likewise. (rs6000_elf_declare_function_name): Likewise. (rs6000_set_up_by_prologue): Likewise. (rs6000_can_eliminate): Likewise, reformat spaces to tabs. * output.h (get_pool_size): Rename to... (get_pool_size_upper_bound): ...This. * varasm.c (get_pool_size): Rename to... (get_pool_size_upper_bound): ...This. From-SVN: r243182 --- gcc/ChangeLog | 15 +++++++++++++++ gcc/config/rs6000/rs6000.c | 23 +++++++++++++---------- gcc/output.h | 7 +++++-- gcc/varasm.c | 2 +- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 85b4bdb..7348684 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,18 @@ +2016-12-02 James Greenhalgh + + PR rtl-optimization/78561 + * config/rs6000/rs6000.c (rs6000_reg_live_or_pic_offset_p) Rename + get_pool_size to get_pool_size_upper_bound. + (rs6000_stack_info): Likewise. + (rs6000_emit_prologue): Likewise. + (rs6000_elf_declare_function_name): Likewise. + (rs6000_set_up_by_prologue): Likewise. + (rs6000_can_eliminate): Likewise, reformat spaces to tabs. + * output.h (get_pool_size): Rename to... + (get_pool_size_upper_bound): ...This. + * varasm.c (get_pool_size): Rename to... + (get_pool_size_upper_bound): ...This. + 2016-12-02 Bin Cheng * match.pd: Add new pattern: diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index e572620..425a885 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -25460,7 +25460,7 @@ rs6000_reg_live_or_pic_offset_p (int reg) if (TARGET_TOC && TARGET_MINIMAL_TOC && (crtl->calls_eh_return || df_regs_ever_live_p (reg) - || get_pool_size ())) + || get_pool_size_upper_bound ())) return true; if ((DEFAULT_ABI == ABI_V4 || DEFAULT_ABI == ABI_DARWIN) @@ -26266,7 +26266,7 @@ rs6000_stack_info (void) #ifdef TARGET_RELOCATABLE || (DEFAULT_ABI == ABI_V4 && (TARGET_RELOCATABLE || flag_pic > 1) - && get_pool_size () != 0) + && get_pool_size_upper_bound () != 0) #endif || rs6000_ra_ever_killed ()) info->lr_save_p = 1; @@ -28044,7 +28044,8 @@ rs6000_emit_prologue (void) cfun->machine->r2_setup_needed = df_regs_ever_live_p (TOC_REGNUM); /* With -mminimal-toc we may generate an extra use of r2 below. */ - if (TARGET_TOC && TARGET_MINIMAL_TOC && get_pool_size () != 0) + if (TARGET_TOC && TARGET_MINIMAL_TOC + && get_pool_size_upper_bound () != 0) cfun->machine->r2_setup_needed = true; } @@ -28899,7 +28900,8 @@ rs6000_emit_prologue (void) /* If we are using RS6000_PIC_OFFSET_TABLE_REGNUM, we need to set it up. */ if (!TARGET_SINGLE_PIC_BASE - && ((TARGET_TOC && TARGET_MINIMAL_TOC && get_pool_size () != 0) + && ((TARGET_TOC && TARGET_MINIMAL_TOC + && get_pool_size_upper_bound () != 0) || (DEFAULT_ABI == ABI_V4 && (flag_pic == 1 || (flag_pic && TARGET_SECURE_PLT)) && df_regs_ever_live_p (RS6000_PIC_OFFSET_TABLE_REGNUM)))) @@ -34966,7 +34968,7 @@ rs6000_elf_declare_function_name (FILE *file, const char *name, tree decl) if (DEFAULT_ABI == ABI_V4 && (TARGET_RELOCATABLE || flag_pic > 1) && !TARGET_SECURE_PLT - && (get_pool_size () != 0 || crtl->profile) + && (get_pool_size_upper_bound () != 0 || crtl->profile) && uses_TOC ()) { char buf[256]; @@ -37449,10 +37451,11 @@ static bool rs6000_can_eliminate (const int from, const int to) { return (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM - ? ! frame_pointer_needed - : from == RS6000_PIC_OFFSET_TABLE_REGNUM - ? ! TARGET_MINIMAL_TOC || TARGET_NO_TOC || get_pool_size () == 0 - : true); + ? ! frame_pointer_needed + : from == RS6000_PIC_OFFSET_TABLE_REGNUM + ? ! TARGET_MINIMAL_TOC || TARGET_NO_TOC + || get_pool_size_upper_bound () == 0 + : true); } /* Define the offset between two registers, FROM to be eliminated and its @@ -38988,7 +38991,7 @@ rs6000_set_up_by_prologue (struct hard_reg_set_container *set) if (!TARGET_SINGLE_PIC_BASE && TARGET_TOC && TARGET_MINIMAL_TOC - && get_pool_size () != 0) + && get_pool_size_upper_bound () != 0) add_to_hard_reg_set (&set->set, Pmode, RS6000_PIC_OFFSET_TABLE_REGNUM); if (cfun->machine->split_stack_argp_used) add_to_hard_reg_set (&set->set, Pmode, 12); diff --git a/gcc/output.h b/gcc/output.h index 0924499..7186dc1 100644 --- a/gcc/output.h +++ b/gcc/output.h @@ -287,8 +287,11 @@ extern void assemble_real (REAL_VALUE_TYPE, machine_mode, unsigned, /* Write the address of the entity given by SYMBOL to SEC. */ extern void assemble_addr_to_section (rtx, section *); -/* Return the size of the constant pool. */ -extern int get_pool_size (void); +/* Return the maximum size of the constant pool. This may be larger + than the final size of the constant pool, as entries may be added to + the constant pool which become unreferenced, or otherwise not need + output by the time we actually emit the pool. */ +extern int get_pool_size_upper_bound (void); extern rtx_insn *peephole (rtx_insn *); diff --git a/gcc/varasm.c b/gcc/varasm.c index 1e7c2b5..f8af0c1 100644 --- a/gcc/varasm.c +++ b/gcc/varasm.c @@ -3811,7 +3811,7 @@ get_pool_mode (const_rtx addr) /* Return the size of the constant pool. */ int -get_pool_size (void) +get_pool_size_upper_bound (void) { return crtl->varasm.pool->offset; } -- cgit v1.1 From 04c452f40ba95e15a76762e4bb5767d15cf8b322 Mon Sep 17 00:00:00 2001 From: James Greenhalgh Date: Fri, 2 Dec 2016 14:31:10 +0000 Subject: [Patch 2/2 PR78561] Recalculate constant pool size before emitting it gcc/ PR rtl-optimization/78561 * varasm.c (recompute_pool_offsets): New. (output_constant_pool): Call it. gcc/testsuite/ PR rtl-optimization/78561 * gcc.target/aarch64/pr78561.c: New. From-SVN: r243183 --- gcc/ChangeLog | 6 ++++++ gcc/testsuite/ChangeLog | 5 +++++ gcc/varasm.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7348684..92501fc 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,6 +1,12 @@ 2016-12-02 James Greenhalgh PR rtl-optimization/78561 + * varasm.c (recompute_pool_offsets): New. + (output_constant_pool): Call it. + +2016-12-02 James Greenhalgh + + PR rtl-optimization/78561 * config/rs6000/rs6000.c (rs6000_reg_live_or_pic_offset_p) Rename get_pool_size to get_pool_size_upper_bound. (rs6000_stack_info): Likewise. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 143687d..9970806 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 James Greenhalgh + + PR rtl-optimization/78561 + * gcc.target/aarch64/pr78561.c: New. + 2016-12-02 Bin Cheng * gcc.dg/fold-bopcond-1.c: New test. diff --git a/gcc/varasm.c b/gcc/varasm.c index f8af0c1..f3cd70a 100644 --- a/gcc/varasm.c +++ b/gcc/varasm.c @@ -3942,6 +3942,29 @@ output_constant_pool_1 (struct constant_descriptor_rtx *desc, return; } +/* Recompute the offsets of entries in POOL, and the overall size of + POOL. Do this after calling mark_constant_pool to ensure that we + are computing the offset values for the pool which we will actually + emit. */ + +static void +recompute_pool_offsets (struct rtx_constant_pool *pool) +{ + struct constant_descriptor_rtx *desc; + pool->offset = 0; + + for (desc = pool->first; desc ; desc = desc->next) + if (desc->mark) + { + /* Recalculate offset. */ + unsigned int align = desc->align; + pool->offset += (align / BITS_PER_UNIT) - 1; + pool->offset &= ~ ((align / BITS_PER_UNIT) - 1); + desc->offset = pool->offset; + pool->offset += GET_MODE_SIZE (desc->mode); + } +} + /* Mark all constants that are referenced by SYMBOL_REFs in X. Emit referenced deferred strings. */ @@ -4060,6 +4083,11 @@ output_constant_pool (const char *fnname ATTRIBUTE_UNUSED, case we do not need to output the constant. */ mark_constant_pool (); + /* Having marked the constant pool entries we'll actually emit, we + now need to rebuild the offset information, which may have become + stale. */ + recompute_pool_offsets (pool); + #ifdef ASM_OUTPUT_POOL_PROLOGUE ASM_OUTPUT_POOL_PROLOGUE (asm_out_file, fnname, fndecl, pool->offset); #endif -- cgit v1.1 From 69a71a6d071a5eae8a06282351e53d8c383aba9a Mon Sep 17 00:00:00 2001 From: Martin Jambor Date: Fri, 2 Dec 2016 15:42:15 +0100 Subject: [hsa] Exclude parallel outlines from hsa_callable_functions_p 2016-12-09 Martin Jambor * hsa.c (hsa_callable_function_p): Return false for artificial functions. From-SVN: r243184 --- gcc/ChangeLog | 5 +++++ gcc/hsa.c | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 92501fc..0880c84 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-09 Martin Jambor + + * hsa.c (hsa_callable_function_p): Return false for artificial + functions. + 2016-12-02 James Greenhalgh PR rtl-optimization/78561 diff --git a/gcc/hsa.c b/gcc/hsa.c index f881e78..31e3252 100644 --- a/gcc/hsa.c +++ b/gcc/hsa.c @@ -90,7 +90,10 @@ bool hsa_callable_function_p (tree fndecl) { return (lookup_attribute ("omp declare target", DECL_ATTRIBUTES (fndecl)) - && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (fndecl))); + && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (fndecl)) + /* At this point, this is enough to identify clones for + parallel, which for HSA would need to be kernels anyway. */ + && !DECL_ARTIFICIAL (fndecl)); } /* Allocate HSA structures that are are used when dealing with different -- cgit v1.1 From c5af52eb8cc208890ccb3bdce75b35bf5bbaa8bf Mon Sep 17 00:00:00 2001 From: Cesar Philippidis Date: Fri, 2 Dec 2016 06:54:39 -0800 Subject: c-parser.c (c_parser_pragma): Error when PRAGMA_OACC_{ENTER_DATA... gcc/c/ * c-parser.c (c_parser_pragma): Error when PRAGMA_OACC_{ENTER_DATA, EXIT_DATA,WAIT} are not used in compound statements. (c_parser_oacc_enter_exit_data): Update diagnostics. gcc/cp/ * parser.c (cp_parser_oacc_enter_exit_data): Update diagnostics. (cp_parser_pragma): Error when PRAGMA_OACC_{ENTER_DATA, EXIT_DATA,WAIT} are not used in compound statements. gcc/testsuite/ * c-c++-common/goacc/data-2.c: Adjust test. * c-c++-common/goacc/executeables-1.c: New test. * g++.dg/goacc/data-1.C: Adjust test. Co-Authored-By: James Norris From-SVN: r243185 --- gcc/c/ChangeLog | 7 +++ gcc/c/c-parser.c | 42 ++++++++++--- gcc/cp/ChangeLog | 7 +++ gcc/cp/parser.c | 75 +++++++++++++++++------ gcc/testsuite/ChangeLog | 7 +++ gcc/testsuite/c-c++-common/goacc/data-2.c | 12 ++-- gcc/testsuite/c-c++-common/goacc/executeables-1.c | 74 ++++++++++++++++++++++ gcc/testsuite/g++.dg/goacc/data-1.C | 16 ++--- 8 files changed, 199 insertions(+), 41 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/executeables-1.c diff --git a/gcc/c/ChangeLog b/gcc/c/ChangeLog index dec9c9d..f3626e2 100644 --- a/gcc/c/ChangeLog +++ b/gcc/c/ChangeLog @@ -1,3 +1,10 @@ +2016-12-02 Cesar Philippidis + James Norris + + * c-parser.c (c_parser_pragma): Error when PRAGMA_OACC_{ENTER_DATA, + EXIT_DATA,WAIT} are not used in compound statements. + (c_parser_oacc_enter_exit_data): Update diagnostics. + 2016-11-21 Bernd Edlinger PR c++/71973 diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 00fe731..f7bf9c4 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -10145,10 +10145,24 @@ c_parser_pragma (c_parser *parser, enum pragma_context context, bool *if_p) return false; case PRAGMA_OACC_ENTER_DATA: + if (context != pragma_compound) + { + if (context == pragma_stmt) + c_parser_error (parser, "%<#pragma acc enter data%> may only be " + "used in compound statements"); + goto bad_stmt; + } c_parser_oacc_enter_exit_data (parser, true); return false; case PRAGMA_OACC_EXIT_DATA: + if (context != pragma_compound) + { + if (context == pragma_stmt) + c_parser_error (parser, "%<#pragma acc exit data%> may only be " + "used in compound statements"); + goto bad_stmt; + } c_parser_oacc_enter_exit_data (parser, false); return false; @@ -10305,6 +10319,16 @@ c_parser_pragma (c_parser *parser, enum pragma_context context, bool *if_p) c_parser_cilk_grainsize (parser, if_p); return false; + case PRAGMA_OACC_WAIT: + if (context != pragma_compound) + { + if (context == pragma_stmt) + c_parser_error (parser, "%<#pragma acc enter data%> may only be " + "used in compound statements"); + goto bad_stmt; + } + /* FALL THROUGH. */ + default: if (id < PRAGMA_FIRST_EXTERNAL) { @@ -13871,28 +13895,26 @@ c_parser_oacc_enter_exit_data (c_parser *parser, bool enter) { location_t loc = c_parser_peek_token (parser)->location; tree clauses, stmt; + const char *p = ""; c_parser_consume_pragma (parser); - if (!c_parser_next_token_is (parser, CPP_NAME)) + if (c_parser_next_token_is (parser, CPP_NAME)) { - c_parser_error (parser, enter - ? "expected % in %<#pragma acc enter data%>" - : "expected % in %<#pragma acc exit data%>"); - c_parser_skip_to_pragma_eol (parser); - return; + p = IDENTIFIER_POINTER (c_parser_peek_token (parser)->value); + c_parser_consume_token (parser); } - const char *p = IDENTIFIER_POINTER (c_parser_peek_token (parser)->value); if (strcmp (p, "data") != 0) { - c_parser_error (parser, "invalid pragma"); + error_at (loc, enter + ? "expected % after %<#pragma acc enter%>" + : "expected % after %<#pragma acc exit%>"); + parser->error = true; c_parser_skip_to_pragma_eol (parser); return; } - c_parser_consume_token (parser); - if (enter) clauses = c_parser_oacc_all_clauses (parser, OACC_ENTER_DATA_CLAUSE_MASK, "#pragma acc enter data"); diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index 4977ff2..d39c222 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -1,3 +1,10 @@ +2016-12-02 Cesar Philippidis + James Norris + + * parser.c (cp_parser_oacc_enter_exit_data): Update diagnostics. + (cp_parser_pragma): Error when PRAGMA_OACC_{ENTER_DATA, + EXIT_DATA,WAIT} are not used in compound statements. + 2016-12-01 Jason Merrill * call.c (add_function_candidate): Also exclude inherited ctors diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 843cbe2..08f5f9e 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -36175,23 +36175,18 @@ static tree cp_parser_oacc_enter_exit_data (cp_parser *parser, cp_token *pragma_tok, bool enter) { + location_t loc = pragma_tok->location; tree stmt, clauses; + const char *p = ""; - if (cp_lexer_next_token_is (parser->lexer, CPP_PRAGMA_EOL) - || cp_lexer_next_token_is_not (parser->lexer, CPP_NAME)) - { - cp_parser_error (parser, enter - ? "expected % in %<#pragma acc enter data%>" - : "expected % in %<#pragma acc exit data%>"); - cp_parser_skip_to_pragma_eol (parser, pragma_tok); - return NULL_TREE; - } + if (cp_lexer_next_token_is (parser->lexer, CPP_NAME)) + p = IDENTIFIER_POINTER (cp_lexer_peek_token (parser->lexer)->u.value); - const char *p = - IDENTIFIER_POINTER (cp_lexer_peek_token (parser->lexer)->u.value); if (strcmp (p, "data") != 0) { - cp_parser_error (parser, "invalid pragma"); + error_at (loc, enter + ? "expected % after %<#pragma acc enter%>" + : "expected % after %<#pragma acc exit%>"); cp_parser_skip_to_pragma_eol (parser, pragma_tok); return NULL_TREE; } @@ -36207,8 +36202,8 @@ cp_parser_oacc_enter_exit_data (cp_parser *parser, cp_token *pragma_tok, if (find_omp_clause (clauses, OMP_CLAUSE_MAP) == NULL_TREE) { - error_at (pragma_tok->location, - "%<#pragma acc enter data%> has no data movement clause"); + error_at (loc, "%<#pragma acc %s data%> has no data movement clause", + enter ? "enter" : "exit"); return NULL_TREE; } @@ -38083,6 +38078,30 @@ cp_parser_pragma (cp_parser *parser, enum pragma_context context, bool *if_p) cp_parser_oacc_declare (parser, pragma_tok); return false; + case PRAGMA_OACC_ENTER_DATA: + if (context == pragma_stmt) + { + cp_parser_error (parser, "%<#pragma acc enter data%> may only be " + "used in compound statements"); + break; + } + else if (context != pragma_compound) + goto bad_stmt; + cp_parser_omp_construct (parser, pragma_tok, if_p); + return true; + + case PRAGMA_OACC_EXIT_DATA: + if (context == pragma_stmt) + { + cp_parser_error (parser, "%<#pragma acc exit data%> may only be " + "used in compound statements"); + break; + } + else if (context != pragma_compound) + goto bad_stmt; + cp_parser_omp_construct (parser, pragma_tok, if_p); + return true; + case PRAGMA_OACC_ROUTINE: if (context != pragma_external) { @@ -38093,17 +38112,37 @@ cp_parser_pragma (cp_parser *parser, enum pragma_context context, bool *if_p) cp_parser_oacc_routine (parser, pragma_tok, context); return false; + case PRAGMA_OACC_UPDATE: + if (context == pragma_stmt) + { + cp_parser_error (parser, "%<#pragma acc update%> may only be " + "used in compound statements"); + break; + } + else if (context != pragma_compound) + goto bad_stmt; + cp_parser_omp_construct (parser, pragma_tok, if_p); + return true; + + case PRAGMA_OACC_WAIT: + if (context == pragma_stmt) + { + cp_parser_error (parser, "%<#pragma acc wait%> may only be " + "used in compound statements"); + break; + } + else if (context != pragma_compound) + goto bad_stmt; + cp_parser_omp_construct (parser, pragma_tok, if_p); + return true; + case PRAGMA_OACC_ATOMIC: case PRAGMA_OACC_CACHE: case PRAGMA_OACC_DATA: - case PRAGMA_OACC_ENTER_DATA: - case PRAGMA_OACC_EXIT_DATA: case PRAGMA_OACC_HOST_DATA: case PRAGMA_OACC_KERNELS: case PRAGMA_OACC_PARALLEL: case PRAGMA_OACC_LOOP: - case PRAGMA_OACC_UPDATE: - case PRAGMA_OACC_WAIT: case PRAGMA_OMP_ATOMIC: case PRAGMA_OMP_CRITICAL: case PRAGMA_OMP_DISTRIBUTE: diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 9970806..f1b5c35 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2016-12-02 Cesar Philippidis + James Norris + + * c-c++-common/goacc/data-2.c: Adjust test. + * c-c++-common/goacc/executeables-1.c: New test. + * g++.dg/goacc/data-1.C: Adjust test. + 2016-12-02 James Greenhalgh PR rtl-optimization/78561 diff --git a/gcc/testsuite/c-c++-common/goacc/data-2.c b/gcc/testsuite/c-c++-common/goacc/data-2.c index a67d8a4..1043bf8a 100644 --- a/gcc/testsuite/c-c++-common/goacc/data-2.c +++ b/gcc/testsuite/c-c++-common/goacc/data-2.c @@ -10,12 +10,14 @@ foo (void) #pragma acc exit data delete (a) if (0) #pragma acc exit data copyout (b) if (a) #pragma acc exit data delete (b) -#pragma acc enter /* { dg-error "expected 'data' in" } */ -#pragma acc exit /* { dg-error "expected 'data' in" } */ +#pragma acc enter /* { dg-error "expected 'data' after" } */ +#pragma acc exit /* { dg-error "expected 'data' after" } */ #pragma acc enter data /* { dg-error "has no data movement clause" } */ -#pragma acc exit data /* { dg-error "has no data movement clause" } */ -#pragma acc enter Data /* { dg-error "invalid pragma before" } */ -#pragma acc exit copyout (b) /* { dg-error "invalid pragma before" } */ +#pragma acc exit data /* { dg-error "no data movement clause" } */ +#pragma acc enter Data /* { dg-error "expected 'data' after" } */ +#pragma acc exit copyout (b) /* { dg-error "expected 'data' after" } */ +#pragma acc enter for /* { dg-error "expected 'data' after" } */ +#pragma acc enter data2 /* { dg-error "expected 'data' after" } */ } /* { dg-error "has no data movement clause" "" { target *-*-* } 8 } */ diff --git a/gcc/testsuite/c-c++-common/goacc/executeables-1.c b/gcc/testsuite/c-c++-common/goacc/executeables-1.c new file mode 100644 index 0000000..da89437 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/executeables-1.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ + +void +foo (void) +{ + const int N = 32; + float x[N], y[N]; + int flag = 0; + + if (flag) +#pragma acc update host (x[0:N]) /* { dg-error "may only be used in compound statements" } */ + flag = 1; + + while (flag) +#pragma acc update host (x[0:N]) /* { dg-error "may only be used in compound statements" } */ + flag = 2; + +#pragma acc enter data create (x[0:N]) + { + if (flag) +#pragma acc update host (x[0:N]) /* { dg-error "may only be used in compound statements" } */ + flag = 1; + } + + if (flag) + while (flag) +#pragma acc update host (x[0:N]) /* { dg-error "may only be used in compound statements" } */ + flag = 2; + + if (flag) +#pragma acc wait /* { dg-error "may only be used in compound statements" } */ + flag = 1; + + while (flag) +#pragma acc wait /* { dg-error "may only be used in compound statements" } */ + flag = 2; + +#pragma acc enter data create (x[0:N]) + { + if (flag) +#pragma acc wait /* { dg-error "may only be used in compound statements" } */ + flag = 1; + } + + if (flag) +#pragma acc enter data create (x[0:N]) /* { dg-error "may only be used in compound statements" } */ + flag = 1; + + while (flag) +#pragma acc enter data create (x[0:N]) /* { dg-error "may only be used in compound statements" } */ + flag = 2; + +#pragma acc enter data create (x[0:N]) + { + if (flag) +#pragma acc enter data create (y[0:N]) /* { dg-error "may only be used in compound statements" } */ + flag = 1; + } + + if (flag) +#pragma acc exit data delete (x[0:N]) /* { dg-error "may only be used in compound statements" } */ + flag = 1; + + while (flag) +#pragma acc exit data delete (x[0:N]) /* { dg-error "may only be used in compound statements" } */ + flag = 2; + +#pragma acc enter data create (x[0:N]) + { + if (flag) +#pragma acc exit data delete (x[0:N]) /* { dg-error "may only be used in compound statements" } */ + flag = 1; + } +} diff --git a/gcc/testsuite/g++.dg/goacc/data-1.C b/gcc/testsuite/g++.dg/goacc/data-1.C index 54676dc..2b210dc 100644 --- a/gcc/testsuite/g++.dg/goacc/data-1.C +++ b/gcc/testsuite/g++.dg/goacc/data-1.C @@ -8,12 +8,12 @@ foo (int &a, int (&b)[100], int &n) #pragma acc exit data delete (a) if (0) #pragma acc exit data copyout (b) if (a) #pragma acc exit data delete (b) -#pragma acc enter /* { dg-error "expected 'data' in" } */ -#pragma acc exit /* { dg-error "expected 'data' in" } */ +#pragma acc enter /* { dg-error "expected 'data' after" } */ +#pragma acc exit /* { dg-error "expected 'data' after" } */ #pragma acc enter data /* { dg-error "has no data movement clause" } */ #pragma acc exit data /* { dg-error "has no data movement clause" } */ -#pragma acc enter Data /* { dg-error "invalid pragma before" } */ -#pragma acc exit copyout (b) /* { dg-error "invalid pragma before" } */ +#pragma acc enter Data /* { dg-error "expected 'data' after" } */ +#pragma acc exit copyout (b) /* { dg-error "expected 'data' after" } */ } template @@ -27,12 +27,12 @@ foo (T &a, T (&b)[100], T &n) #pragma acc exit data delete (a) if (0) #pragma acc exit data copyout (b) if (a) #pragma acc exit data delete (b) -#pragma acc enter /* { dg-error "expected 'data' in" } */ -#pragma acc exit /* { dg-error "expected 'data' in" } */ +#pragma acc enter /* { dg-error "expected 'data' after" } */ +#pragma acc exit /* { dg-error "expected 'data' after" } */ #pragma acc enter data /* { dg-error "has no data movement clause" } */ #pragma acc exit data /* { dg-error "has no data movement clause" } */ -#pragma acc enter Data /* { dg-error "invalid pragma before" } */ -#pragma acc exit copyout (b) /* { dg-error "invalid pragma before" } */ +#pragma acc enter Data /* { dg-error "expected 'data' after" } */ +#pragma acc exit copyout (b) /* { dg-error "expected 'data' after" } */ } /* { dg-error "has no data movement clause" "" { target *-*-* } 6 } */ -- cgit v1.1 From f1bca06f624245fde8a485deb2a589ba5d752537 Mon Sep 17 00:00:00 2001 From: Georg-Johann Lay Date: Fri, 2 Dec 2016 15:08:27 +0000 Subject: avr.c: Fix coding rule glitches. * config/avr/avr.c: Fix coding rule glitches. From-SVN: r243186 --- gcc/ChangeLog | 6 +- gcc/config/avr/avr.c | 361 +++++++++++++++++++++++++-------------------------- 2 files changed, 184 insertions(+), 183 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 0880c84..5849b0f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,4 +1,8 @@ -2016-12-09 Martin Jambor +2016-12-02 Georg-Johann Lay + + * config/avr/avr.c: Fix coding rule glitches. + +2016-12-02 Martin Jambor * hsa.c (hsa_callable_function_p): Return false for artificial functions. diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c index 48f4788..8ceeff4 100644 --- a/gcc/config/avr/avr.c +++ b/gcc/config/avr/avr.c @@ -747,8 +747,8 @@ avr_option_override (void) introduces additional code in LIM and increases reg pressure. */ maybe_set_param_value (PARAM_ALLOW_STORE_DATA_RACES, 1, - global_options.x_param_values, - global_options_set.x_param_values); + global_options.x_param_values, + global_options_set.x_param_values); /* Unwind tables currently require a frame pointer for correctness, see toplev.c:process_options(). */ @@ -1034,7 +1034,7 @@ avr_set_current_function (tree decl) if (cfun->machine->is_OS_task + cfun->machine->is_OS_main + (cfun->machine->is_signal || cfun->machine->is_interrupt) > 1) error_at (loc, "function attributes %qs, %qs and %qs are mutually" - " exclusive", "OS_task", "OS_main", isr); + " exclusive", "OS_task", "OS_main", isr); /* 'naked' will hide effects of 'OS_task' and 'OS_main'. */ @@ -1299,7 +1299,7 @@ avr_return_addr_rtx (int count, rtx tem) /* Can only return this function's return address. Others not supported. */ if (count) - return NULL; + return NULL; if (AVR_3_BYTE_PC) { @@ -1313,7 +1313,7 @@ avr_return_addr_rtx (int count, rtx tem) r = gen_rtx_PLUS (Pmode, tem, r); r = gen_frame_mem (Pmode, memory_address (Pmode, r)); r = gen_rtx_ROTATE (HImode, r, GEN_INT (8)); - return r; + return r; } /* Return 1 if the function epilogue is just a single "ret". */ @@ -2093,7 +2093,6 @@ avr_asm_function_begin_epilogue (FILE *file) static bool avr_cannot_modify_jumps_p (void) { - /* Naked Functions must not have any instructions after their epilogue, see PR42240 */ @@ -2698,7 +2697,7 @@ avr_print_operand (FILE *file, rtx x, int code) else if (code == 'b') { if (GET_CODE (addr) != PLUS) - fatal_insn ("bad address, not (reg+disp):", addr); + fatal_insn ("bad address, not (reg+disp):", addr); avr_print_operand_address (file, VOIDmode, XEXP (addr, 0)); } @@ -2708,7 +2707,7 @@ avr_print_operand (FILE *file, rtx x, int code) fatal_insn ("bad address, not post_inc or pre_dec:", addr); if (code == 'p') - /* X, Y, Z */ + /* X, Y, Z */ avr_print_operand_address (file, VOIDmode, XEXP (addr, 0)); else avr_print_operand (file, XEXP (addr, 0), 0); /* r26, r28, r30 */ @@ -3723,8 +3722,6 @@ output_movhi (rtx_insn *insn, rtx xop[], int *plen) return avr_out_lpm (insn, xop, plen); } - gcc_assert (2 == GET_MODE_SIZE (GET_MODE (dest))); - if (REG_P (dest)) { if (REG_P (src)) /* mov r,r */ @@ -3825,8 +3822,8 @@ out_movqi_r_mr (rtx_insn *insn, rtx op[], int *plen) } if (GET_CODE (x) == PLUS - && REG_P (XEXP (x, 0)) - && CONST_INT_P (XEXP (x, 1))) + && REG_P (XEXP (x, 0)) + && CONST_INT_P (XEXP (x, 1))) { /* memory access by reg+disp */ @@ -4016,7 +4013,7 @@ out_movhi_r_mr (rtx_insn *insn, rtx op[], int *plen) "ldd %B0,Y+63" CR_TAB "sbiw r28,%o1-62", op, plen, -4) - : avr_asm_len ("subi r28,lo8(-%o1)" CR_TAB + : avr_asm_len ("subi r28,lo8(-%o1)" CR_TAB "sbci r29,hi8(-%o1)" CR_TAB "ld %A0,Y" CR_TAB "ldd %B0,Y+1" CR_TAB @@ -4385,7 +4382,7 @@ avr_out_movsi_mr_r_reg_no_disp_tiny (rtx_insn *insn, rtx op[], int *l) if (reg_base == reg_src) { - /* "ld r26,-X" is undefined */ + /* "ld r26,-X" is undefined */ if (reg_unused_after (insn, base)) { return *l = 7, ("mov __tmp_reg__, %B1" CR_TAB @@ -4672,6 +4669,7 @@ output_movsisf (rtx_insn *insn, rtx operands[], int *l) l = &dummy; gcc_assert (4 == GET_MODE_SIZE (GET_MODE (dest))); + if (REG_P (dest)) { if (REG_P (src)) /* mov r,r */ @@ -4717,7 +4715,7 @@ output_movsisf (rtx_insn *insn, rtx operands[], int *l) const char *templ; if (src == CONST0_RTX (GET_MODE (dest))) - operands[1] = zero_reg_rtx; + operands[1] = zero_reg_rtx; templ = out_movsi_mr_r (insn, operands, real_l); @@ -4785,7 +4783,7 @@ avr_out_load_psi_reg_disp_tiny (rtx_insn *insn, rtx *op, int *plen) TINY_SBIW (%I1, %J1, 1) CR_TAB "ld %A0,%b1" CR_TAB "mov %B0,__tmp_reg__", op, plen, -8); - } + } else { avr_asm_len (TINY_ADIW (%I1, %J1, %o1) CR_TAB @@ -4914,9 +4912,9 @@ avr_out_load_psi (rtx_insn *insn, rtx *op, int *plen) "ldd %A0,%A1" CR_TAB "mov %B0,__tmp_reg__", op, plen, -4); - return avr_asm_len ("ldd %A0,%A1" CR_TAB - "ldd %B0,%B1" CR_TAB - "ldd %C0,%C1", op, plen, -3); + return avr_asm_len ("ldd %A0,%A1" CR_TAB + "ldd %B0,%B1" CR_TAB + "ldd %C0,%C1", op, plen, -3); } else if (GET_CODE (base) == PRE_DEC) /* (--R) */ return avr_asm_len ("ld %C0,%1" CR_TAB @@ -5191,14 +5189,14 @@ avr_out_movqi_mr_r_reg_disp_tiny (rtx_insn *insn, rtx op[], int *plen) TINY_ADIW (%I0, %J0, %o0) CR_TAB "st %b0,__tmp_reg__", op, plen, -4); } - else + else { avr_asm_len (TINY_ADIW (%I0, %J0, %o0) CR_TAB "st %b0,%1", op, plen, -3); } if (!reg_unused_after (insn, XEXP (x, 0))) - avr_asm_len (TINY_SBIW (%I0, %J0, %o0), op, plen, 2); + avr_asm_len (TINY_SBIW (%I0, %J0, %o0), op, plen, 2); return ""; } @@ -5410,11 +5408,11 @@ avr_out_movhi_mr_r_reg_no_disp_tiny (rtx_insn *insn, rtx op[], int *plen) } return !mem_volatile_p && reg_unused_after (insn, base) - ? avr_asm_len ("st %0+,%A1" CR_TAB - "st %0,%B1", op, plen, -2) - : avr_asm_len (TINY_ADIW (%E0, %F0, 1) CR_TAB - "st %0,%B1" CR_TAB - "st -%0,%A1", op, plen, -4); + ? avr_asm_len ("st %0+,%A1" CR_TAB + "st %0,%B1", op, plen, -2) + : avr_asm_len (TINY_ADIW (%E0, %F0, 1) CR_TAB + "st %0,%B1" CR_TAB + "st -%0,%A1", op, plen, -4); } static const char* @@ -5797,8 +5795,8 @@ avr_out_compare (rtx_insn *insn, rtx *xop, int *plen) && reg_unused_after (insn, xreg)) { return AVR_TINY - ? avr_asm_len (TINY_ADIW (%A0, %B0, %n1), xop, plen, 2) - : avr_asm_len ("adiw %0,%n1", xop, plen, 1); + ? avr_asm_len (TINY_ADIW (%A0, %B0, %n1), xop, plen, 2) + : avr_asm_len ("adiw %0,%n1", xop, plen, 1); } } @@ -5973,7 +5971,7 @@ out_shift_with_cnt (const char *templ, rtx_insn *insn, rtx operands[], int max_len = 10; /* If larger than this, always use a loop. */ if (count <= 0) - return; + return; if (count < 8 && !scratch) use_zero_reg = true; @@ -6044,7 +6042,7 @@ out_shift_with_cnt (const char *templ, rtx_insn *insn, rtx operands[], fatal_insn ("bad shift insn:", insn); if (second_label) - avr_asm_len ("rjmp 2f", op, plen, 1); + avr_asm_len ("rjmp 2f", op, plen, 1); avr_asm_len ("1:", op, plen, 0); avr_asm_len (templ, op, plen, t_len); @@ -8774,9 +8772,9 @@ avr_out_fract (rtx_insn *insn, rtx operands[], bool intsigned, int *plen) xop[3] = all_regs_rtx[dest.regno_msb]; avr_asm_len ("ldi %3,127", xop, plen, 1); avr_asm_len ((have_carry && lsb_in_tmp_reg ? "adc __tmp_reg__,%3" - : have_carry ? "adc %2,%3" - : lsb_in_tmp_reg ? "add __tmp_reg__,%3" - : "add %2,%3"), + : have_carry ? "adc %2,%3" + : lsb_in_tmp_reg ? "add __tmp_reg__,%3" + : "add %2,%3"), xop, plen, 1); } else @@ -8860,7 +8858,7 @@ avr_out_fract (rtx_insn *insn, rtx operands[], bool intsigned, int *plen) "lsl __tmp_reg__", &all_regs_rtx[s0], plen, 2); sign_in_carry = true; - } + } gcc_assert (sign_in_carry + msb_in_carry + lsb_in_carry <= 1); @@ -8979,150 +8977,150 @@ avr_out_round (rtx_insn *insn ATTRIBUTE_UNUSED, rtx *xop, int *plen) /* Create RTL split patterns for byte sized rotate expressions. This - produces a series of move instructions and considers overlap situations. - Overlapping non-HImode operands need a scratch register. */ + produces a series of move instructions and considers overlap situations. + Overlapping non-HImode operands need a scratch register. */ bool avr_rotate_bytes (rtx operands[]) { - machine_mode mode = GET_MODE (operands[0]); - bool overlapped = reg_overlap_mentioned_p (operands[0], operands[1]); - bool same_reg = rtx_equal_p (operands[0], operands[1]); - int num = INTVAL (operands[2]); - rtx scratch = operands[3]; - /* Work out if byte or word move is needed. Odd byte rotates need QImode. - Word move if no scratch is needed, otherwise use size of scratch. */ - machine_mode move_mode = QImode; - int move_size, offset, size; - - if (num & 0xf) - move_mode = QImode; - else if ((mode == SImode && !same_reg) || !overlapped) - move_mode = HImode; - else - move_mode = GET_MODE (scratch); - - /* Force DI rotate to use QI moves since other DI moves are currently split - into QI moves so forward propagation works better. */ - if (mode == DImode) - move_mode = QImode; - /* Make scratch smaller if needed. */ - if (SCRATCH != GET_CODE (scratch) - && HImode == GET_MODE (scratch) - && QImode == move_mode) - scratch = simplify_gen_subreg (move_mode, scratch, HImode, 0); - - move_size = GET_MODE_SIZE (move_mode); - /* Number of bytes/words to rotate. */ - offset = (num >> 3) / move_size; - /* Number of moves needed. */ - size = GET_MODE_SIZE (mode) / move_size; - /* Himode byte swap is special case to avoid a scratch register. */ - if (mode == HImode && same_reg) - { - /* HImode byte swap, using xor. This is as quick as using scratch. */ - rtx src, dst; - src = simplify_gen_subreg (move_mode, operands[1], mode, 0); - dst = simplify_gen_subreg (move_mode, operands[0], mode, 1); - if (!rtx_equal_p (dst, src)) - { - emit_move_insn (dst, gen_rtx_XOR (QImode, dst, src)); - emit_move_insn (src, gen_rtx_XOR (QImode, src, dst)); - emit_move_insn (dst, gen_rtx_XOR (QImode, dst, src)); - } - } - else - { + machine_mode mode = GET_MODE (operands[0]); + bool overlapped = reg_overlap_mentioned_p (operands[0], operands[1]); + bool same_reg = rtx_equal_p (operands[0], operands[1]); + int num = INTVAL (operands[2]); + rtx scratch = operands[3]; + /* Work out if byte or word move is needed. Odd byte rotates need QImode. + Word move if no scratch is needed, otherwise use size of scratch. */ + machine_mode move_mode = QImode; + int move_size, offset, size; + + if (num & 0xf) + move_mode = QImode; + else if ((mode == SImode && !same_reg) || !overlapped) + move_mode = HImode; + else + move_mode = GET_MODE (scratch); + + /* Force DI rotate to use QI moves since other DI moves are currently split + into QI moves so forward propagation works better. */ + if (mode == DImode) + move_mode = QImode; + /* Make scratch smaller if needed. */ + if (SCRATCH != GET_CODE (scratch) + && HImode == GET_MODE (scratch) + && QImode == move_mode) + scratch = simplify_gen_subreg (move_mode, scratch, HImode, 0); + + move_size = GET_MODE_SIZE (move_mode); + /* Number of bytes/words to rotate. */ + offset = (num >> 3) / move_size; + /* Number of moves needed. */ + size = GET_MODE_SIZE (mode) / move_size; + /* Himode byte swap is special case to avoid a scratch register. */ + if (mode == HImode && same_reg) + { + /* HImode byte swap, using xor. This is as quick as using scratch. */ + rtx src, dst; + src = simplify_gen_subreg (move_mode, operands[1], mode, 0); + dst = simplify_gen_subreg (move_mode, operands[0], mode, 1); + if (!rtx_equal_p (dst, src)) + { + emit_move_insn (dst, gen_rtx_XOR (QImode, dst, src)); + emit_move_insn (src, gen_rtx_XOR (QImode, src, dst)); + emit_move_insn (dst, gen_rtx_XOR (QImode, dst, src)); + } + } + else + { #define MAX_SIZE 8 /* GET_MODE_SIZE (DImode) / GET_MODE_SIZE (QImode) */ - /* Create linked list of moves to determine move order. */ - struct { - rtx src, dst; - int links; - } move[MAX_SIZE + 8]; - int blocked, moves; - - gcc_assert (size <= MAX_SIZE); - /* Generate list of subreg moves. */ - for (int i = 0; i < size; i++) - { - int from = i; - int to = (from + offset) % size; - move[i].src = simplify_gen_subreg (move_mode, operands[1], - mode, from * move_size); - move[i].dst = simplify_gen_subreg (move_mode, operands[0], - mode, to * move_size); - move[i].links = -1; - } - /* Mark dependence where a dst of one move is the src of another move. - The first move is a conflict as it must wait until second is - performed. We ignore moves to self - we catch this later. */ - if (overlapped) - for (int i = 0; i < size; i++) - if (reg_overlap_mentioned_p (move[i].dst, operands[1])) - for (int j = 0; j < size; j++) - if (j != i && rtx_equal_p (move[j].src, move[i].dst)) - { - /* The dst of move i is the src of move j. */ - move[i].links = j; - break; - } - - blocked = -1; - moves = 0; - /* Go through move list and perform non-conflicting moves. As each - non-overlapping move is made, it may remove other conflicts - so the process is repeated until no conflicts remain. */ - do - { - blocked = -1; - moves = 0; - /* Emit move where dst is not also a src or we have used that - src already. */ - for (int i = 0; i < size; i++) - if (move[i].src != NULL_RTX) - { - if (move[i].links == -1 - || move[move[i].links].src == NULL_RTX) - { - moves++; - /* Ignore NOP moves to self. */ - if (!rtx_equal_p (move[i].dst, move[i].src)) - emit_move_insn (move[i].dst, move[i].src); - - /* Remove conflict from list. */ - move[i].src = NULL_RTX; - } - else - blocked = i; - } + /* Create linked list of moves to determine move order. */ + struct { + rtx src, dst; + int links; + } move[MAX_SIZE + 8]; + int blocked, moves; + + gcc_assert (size <= MAX_SIZE); + /* Generate list of subreg moves. */ + for (int i = 0; i < size; i++) + { + int from = i; + int to = (from + offset) % size; + move[i].src = simplify_gen_subreg (move_mode, operands[1], + mode, from * move_size); + move[i].dst = simplify_gen_subreg (move_mode, operands[0], + mode, to * move_size); + move[i].links = -1; + } + /* Mark dependence where a dst of one move is the src of another move. + The first move is a conflict as it must wait until second is + performed. We ignore moves to self - we catch this later. */ + if (overlapped) + for (int i = 0; i < size; i++) + if (reg_overlap_mentioned_p (move[i].dst, operands[1])) + for (int j = 0; j < size; j++) + if (j != i && rtx_equal_p (move[j].src, move[i].dst)) + { + /* The dst of move i is the src of move j. */ + move[i].links = j; + break; + } - /* Check for deadlock. This is when no moves occurred and we have - at least one blocked move. */ - if (moves == 0 && blocked != -1) - { - /* Need to use scratch register to break deadlock. - Add move to put dst of blocked move into scratch. - When this move occurs, it will break chain deadlock. - The scratch register is substituted for real move. */ - - gcc_assert (SCRATCH != GET_CODE (scratch)); - - move[size].src = move[blocked].dst; - move[size].dst = scratch; - /* Scratch move is never blocked. */ - move[size].links = -1; - /* Make sure we have valid link. */ - gcc_assert (move[blocked].links != -1); - /* Replace src of blocking move with scratch reg. */ - move[move[blocked].links].src = scratch; - /* Make dependent on scratch move occurring. */ - move[blocked].links = size; - size=size+1; - } - } - while (blocked != -1); - } - return true; + blocked = -1; + moves = 0; + /* Go through move list and perform non-conflicting moves. As each + non-overlapping move is made, it may remove other conflicts + so the process is repeated until no conflicts remain. */ + do + { + blocked = -1; + moves = 0; + /* Emit move where dst is not also a src or we have used that + src already. */ + for (int i = 0; i < size; i++) + if (move[i].src != NULL_RTX) + { + if (move[i].links == -1 + || move[move[i].links].src == NULL_RTX) + { + moves++; + /* Ignore NOP moves to self. */ + if (!rtx_equal_p (move[i].dst, move[i].src)) + emit_move_insn (move[i].dst, move[i].src); + + /* Remove conflict from list. */ + move[i].src = NULL_RTX; + } + else + blocked = i; + } + + /* Check for deadlock. This is when no moves occurred and we have + at least one blocked move. */ + if (moves == 0 && blocked != -1) + { + /* Need to use scratch register to break deadlock. + Add move to put dst of blocked move into scratch. + When this move occurs, it will break chain deadlock. + The scratch register is substituted for real move. */ + + gcc_assert (SCRATCH != GET_CODE (scratch)); + + move[size].src = move[blocked].dst; + move[size].dst = scratch; + /* Scratch move is never blocked. */ + move[size].links = -1; + /* Make sure we have valid link. */ + gcc_assert (move[blocked].links != -1); + /* Replace src of blocking move with scratch reg. */ + move[move[blocked].links].src = scratch; + /* Make dependent on scratch move occurring. */ + move[blocked].links = size; + size=size+1; + } + } + while (blocked != -1); + } + return true; } @@ -9900,7 +9898,6 @@ avr_asm_output_aligned_decl_common (FILE * stream, && SYMBOL_REF_P ((symbol = XEXP (mem, 0))) && (SYMBOL_REF_FLAGS (symbol) & (SYMBOL_FLAG_IO | SYMBOL_FLAG_ADDRESS))) { - if (!local_p) { fprintf (stream, "\t.globl\t"); @@ -10139,7 +10136,7 @@ avr_encode_section_info (tree decl, rtx rtl, int new_decl_p) && TREE_CODE (decl) != FUNCTION_DECL && MEM_P (rtl) && SYMBOL_REF_P (XEXP (rtl, 0))) - { + { rtx sym = XEXP (rtl, 0); tree type = TREE_TYPE (decl); tree attr = DECL_ATTRIBUTES (decl); @@ -10345,7 +10342,7 @@ avr_adjust_reg_alloc_order (void) 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 0, 1, 32, 33, 34, 35 - }; + }; static const int tiny_order_0[] = { 20, 21, 22, 23, @@ -10366,7 +10363,7 @@ avr_adjust_reg_alloc_order (void) 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 0, 1, 32, 33, 34, 35 - }; + }; static const int tiny_order_1[] = { 22, 23, 24, 25, @@ -10386,7 +10383,7 @@ avr_adjust_reg_alloc_order (void) 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 32, 33, 34, 35 - }; + }; /* Select specific register allocation order. Tiny Core (ATtiny4/5/9/10/20/40) devices have only 16 registers, @@ -10397,7 +10394,7 @@ avr_adjust_reg_alloc_order (void) : (AVR_TINY ? tiny_order_0 : order_0)); for (size_t i = 0; i < ARRAY_SIZE (order_0); ++i) - reg_alloc_order[i] = order[i]; + reg_alloc_order[i] = order[i]; } @@ -10767,10 +10764,10 @@ avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, *total = COSTS_N_INSNS (AVR_HAVE_JMP_CALL ? 5 : 4); } - if (mode == DImode) - *total *= 2; + if (mode == DImode) + *total *= 2; - return true; + return true; default: return false; @@ -13187,7 +13184,7 @@ avr_expand_delay_cycles (rtx operands0) emit_insn (gen_delay_cycles_1 (gen_int_mode (loop_count, QImode), avr_mem_clobber())); cycles -= cycles_used; - } + } while (cycles >= 2) { -- cgit v1.1 From de7b57234525f55e5edfe8db77ca7ac2a943468f Mon Sep 17 00:00:00 2001 From: Andre Vieira Date: Fri, 2 Dec 2016 15:22:43 +0000 Subject: Add support for ARMv8-M's Secure Extensions flag and intrinsics gcc/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * config.gcc (extra_headers): Added arm_cmse.h. * config/arm/arm-arches.def (ARM_ARCH): (armv8-m): Add FL2_CMSE. (armv8-m.main): Likewise. (armv8-m.main+dsp): Likewise. * config/arm/arm-c.c (arm_cpu_builtins): Added __ARM_FEATURE_CMSE macro. * config/arm/arm-flags.h: Define FL2_CMSE. * config/arm.c (arm_arch_cmse): New. (arm_option_override): New error for unsupported cmse target. * config/arm/arm.h (arm_arch_cmse): New. * config/arm/arm.opt (mcmse): New. * config/arm/arm_cmse.h: New file. * doc/invoke.texi (ARM Options): Add -mcmse. * doc/sourcebuild.texi (arm_cmse_ok): Add new effective target. * doc/extend.texi: Add ARMv8-M Security Extensions entry. gcc/testsuite/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * gcc.target/arm/cmse/cmse.exp: New. * gcc.target/arm/cmse/cmse-1.c: New. * gcc.target/arm/cmse/cmse-12.c: New. * lib/target-supports.exp (check_effective_target_arm_cmse_ok): New. libgcc/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * config/arm/t-arm (HAVE_CMSE): New. * config/arm/cmse.c: New. Co-Authored-By: Thomas Preud'homme From-SVN: r243187 --- gcc/ChangeLog | 20 +++ gcc/config.gcc | 2 +- gcc/config/arm/arm-arches.def | 6 +- gcc/config/arm/arm-c.c | 8 ++ gcc/config/arm/arm-flags.h | 1 + gcc/config/arm/arm.c | 7 + gcc/config/arm/arm.h | 3 + gcc/config/arm/arm.opt | 4 + gcc/config/arm/arm_cmse.h | 192 ++++++++++++++++++++++++++++ gcc/doc/extend.texi | 26 ++++ gcc/doc/invoke.texi | 8 +- gcc/doc/sourcebuild.texi | 4 + gcc/testsuite/ChangeLog | 9 ++ gcc/testsuite/gcc.target/arm/cmse/cmse-1.c | 67 ++++++++++ gcc/testsuite/gcc.target/arm/cmse/cmse-12.c | 14 ++ gcc/testsuite/gcc.target/arm/cmse/cmse.exp | 50 ++++++++ gcc/testsuite/lib/target-supports.exp | 13 ++ libgcc/ChangeLog | 6 + libgcc/config/arm/cmse.c | 108 ++++++++++++++++ libgcc/config/arm/t-arm | 12 ++ 20 files changed, 555 insertions(+), 5 deletions(-) create mode 100644 gcc/config/arm/arm_cmse.h create mode 100644 gcc/testsuite/gcc.target/arm/cmse/cmse-1.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/cmse-12.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/cmse.exp create mode 100644 libgcc/config/arm/cmse.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5849b0f..f24e99e 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,23 @@ +2016-12-02 Andre Vieira + Thomas Preud'homme + + * config.gcc (extra_headers): Added arm_cmse.h. + * config/arm/arm-arches.def (ARM_ARCH): + (armv8-m): Add FL2_CMSE. + (armv8-m.main): Likewise. + (armv8-m.main+dsp): Likewise. + * config/arm/arm-c.c + (arm_cpu_builtins): Added __ARM_FEATURE_CMSE macro. + * config/arm/arm-flags.h: Define FL2_CMSE. + * config/arm.c (arm_arch_cmse): New. + (arm_option_override): New error for unsupported cmse target. + * config/arm/arm.h (arm_arch_cmse): New. + * config/arm/arm.opt (mcmse): New. + * config/arm/arm_cmse.h: New file. + * doc/invoke.texi (ARM Options): Add -mcmse. + * doc/sourcebuild.texi (arm_cmse_ok): Add new effective target. + * doc/extend.texi: Add ARMv8-M Security Extensions entry. + 2016-12-02 Georg-Johann Lay * config/avr/avr.c: Fix coding rule glitches. diff --git a/gcc/config.gcc b/gcc/config.gcc index 98267d8..1fa34ac 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -323,7 +323,7 @@ arc*-*-*) arm*-*-*) cpu_type=arm extra_objs="arm-builtins.o aarch-common.o" - extra_headers="mmintrin.h arm_neon.h arm_acle.h arm_fp16.h" + extra_headers="mmintrin.h arm_neon.h arm_acle.h arm_fp16.h arm_cmse.h" target_type_format_char='%' c_target_objs="arm-c.o" cxx_target_objs="arm-c.o" diff --git a/gcc/config/arm/arm-arches.def b/gcc/config/arm/arm-arches.def index cd79bc5..71cabcc 100644 --- a/gcc/config/arm/arm-arches.def +++ b/gcc/config/arm/arm-arches.def @@ -70,10 +70,10 @@ ARM_ARCH ("armv8.2-a+fp16", cortexa53, 8A, ARM_FSET_MAKE (FL_CO_PROC | FL_CRC32 | FL_FOR_ARCH8A, FL2_FOR_ARCH8_2A | FL2_FP16INST)) ARM_ARCH("armv8-m.base", cortexm23, 8M_BASE, - ARM_FSET_MAKE_CPU1 ( FL_FOR_ARCH8M_BASE)) + ARM_FSET_MAKE (FL_FOR_ARCH8M_BASE, FL2_CMSE)) ARM_ARCH("armv8-m.main", cortexm7, 8M_MAIN, - ARM_FSET_MAKE_CPU1(FL_CO_PROC | FL_FOR_ARCH8M_MAIN)) + ARM_FSET_MAKE (FL_CO_PROC | FL_FOR_ARCH8M_MAIN, FL2_CMSE)) ARM_ARCH("armv8-m.main+dsp", cortexm33, 8M_MAIN, - ARM_FSET_MAKE_CPU1(FL_CO_PROC | FL_ARCH7EM | FL_FOR_ARCH8M_MAIN)) + ARM_FSET_MAKE (FL_CO_PROC | FL_ARCH7EM | FL_FOR_ARCH8M_MAIN, FL2_CMSE)) ARM_ARCH("iwmmxt", iwmmxt, 5TE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT)) ARM_ARCH("iwmmxt2", iwmmxt2, 5TE, ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT | FL_IWMMXT2)) diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c index 74417a6..b592134 100644 --- a/gcc/config/arm/arm-c.c +++ b/gcc/config/arm/arm-c.c @@ -77,6 +77,14 @@ arm_cpu_builtins (struct cpp_reader* pfile) def_or_undef_macro (pfile, "__ARM_32BIT_STATE", TARGET_32BIT); + if (arm_arch8 && !arm_arch_notm) + { + if (arm_arch_cmse && use_cmse) + builtin_define_with_int_value ("__ARM_FEATURE_CMSE", 3); + else + builtin_define ("__ARM_FEATURE_CMSE"); + } + if (TARGET_ARM_FEATURE_LDREX) builtin_define_with_int_value ("__ARM_FEATURE_LDREX", TARGET_ARM_FEATURE_LDREX); diff --git a/gcc/config/arm/arm-flags.h b/gcc/config/arm/arm-flags.h index 7ce059b..fb49838 100644 --- a/gcc/config/arm/arm-flags.h +++ b/gcc/config/arm/arm-flags.h @@ -70,6 +70,7 @@ #define FL2_ARCH8_2 (1U << 1) /* Architecture 8.2. */ #define FL2_FP16INST (1U << 2) /* FP16 Instructions for ARMv8.2 and later. */ +#define FL2_CMSE (1U << 3) /* ARMv8-M Security Extensions. */ /* Flags that only effect tuning, not available instructions. */ #define FL_TUNE (FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \ diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 74cb64c..0d7d38a 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -909,6 +909,9 @@ int arm_condexec_masklen = 0; /* Nonzero if chip supports the ARMv8 CRC instructions. */ int arm_arch_crc = 0; +/* Nonzero if chip supports the ARMv8-M security extensions. */ +int arm_arch_cmse = 0; + /* Nonzero if the core has a very small, high-latency, multiply unit. */ int arm_m_profile_small_mul = 0; @@ -3227,6 +3230,7 @@ arm_option_override (void) arm_arch_no_volatile_ce = ARM_FSET_HAS_CPU1 (insn_flags, FL_NO_VOLATILE_CE); arm_tune_cortex_a9 = (arm_tune == TARGET_CPU_cortexa9) != 0; arm_arch_crc = ARM_FSET_HAS_CPU1 (insn_flags, FL_CRC32); + arm_arch_cmse = ARM_FSET_HAS_CPU2 (insn_flags, FL2_CMSE); arm_m_profile_small_mul = ARM_FSET_HAS_CPU1 (insn_flags, FL_SMALLMUL); arm_fp16_inst = ARM_FSET_HAS_CPU2 (insn_flags, FL2_FP16INST); if (arm_fp16_inst) @@ -3494,6 +3498,9 @@ arm_option_override (void) if (target_slow_flash_data || target_pure_code) arm_disable_literal_pool = true; + if (use_cmse && !arm_arch_cmse) + error ("target CPU does not support ARMv8-M Security Extensions"); + /* Disable scheduling fusion by default if it's not armv7 processor or doesn't prefer ldrd/strd. */ if (flag_schedule_fusion == 2 diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 464710b..3d62743 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -523,6 +523,9 @@ extern bool arm_disable_literal_pool; /* Nonzero if chip supports the ARMv8 CRC instructions. */ extern int arm_arch_crc; +/* Nonzero if chip supports the ARMv8-M Security Extensions. */ +extern int arm_arch_cmse; + #ifndef TARGET_DEFAULT #define TARGET_DEFAULT (MASK_APCS_FRAME) #endif diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt index 8856976..a37facc 100644 --- a/gcc/config/arm/arm.opt +++ b/gcc/config/arm/arm.opt @@ -105,6 +105,10 @@ mfloat-abi= Target RejectNegative Joined Enum(float_abi_type) Var(arm_float_abi) Init(TARGET_DEFAULT_FLOAT_ABI) Specify if floating point hardware should be used. +mcmse +Target RejectNegative Var(use_cmse) +Specify that the compiler should target secure code as per ARMv8-M Security Extensions. + Enum Name(float_abi_type) Type(enum float_abi_type) Known floating-point ABIs (for use with the -mfloat-abi= option): diff --git a/gcc/config/arm/arm_cmse.h b/gcc/config/arm/arm_cmse.h new file mode 100644 index 0000000..894343b --- /dev/null +++ b/gcc/config/arm/arm_cmse.h @@ -0,0 +1,192 @@ +/* ARMv8-M Secure Extensions intrinsics include file. + + Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + + +#ifndef _GCC_ARM_CMSE_H +#define _GCC_ARM_CMSE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#if __ARM_FEATURE_CMSE & 1 + +#include +#include + +#ifdef __ARM_BIG_ENDIAN + +typedef union { + struct cmse_address_info { +#if __ARM_FEATURE_CMSE & 2 + unsigned idau_region:8; + unsigned idau_region_valid:1; + unsigned secure:1; + unsigned nonsecure_readwrite_ok:1; + unsigned nonsecure_read_ok:1; +#else + unsigned :12; +#endif + unsigned readwrite_ok:1; + unsigned read_ok:1; +#if __ARM_FEATURE_CMSE & 2 + unsigned sau_region_valid:1; +#else + unsigned :1; +#endif + unsigned mpu_region_valid:1; +#if __ARM_FEATURE_CMSE & 2 + unsigned sau_region:8; +#else + unsigned :8; +#endif + unsigned mpu_region:8; + } flags; + unsigned value; +} cmse_address_info_t; + +#else + +typedef union { + struct cmse_address_info { + unsigned mpu_region:8; +#if __ARM_FEATURE_CMSE & 2 + unsigned sau_region:8; +#else + unsigned :8; +#endif + unsigned mpu_region_valid:1; +#if __ARM_FEATURE_CMSE & 2 + unsigned sau_region_valid:1; +#else + unsigned :1; +#endif + unsigned read_ok:1; + unsigned readwrite_ok:1; +#if __ARM_FEATURE_CMSE & 2 + unsigned nonsecure_read_ok:1; + unsigned nonsecure_readwrite_ok:1; + unsigned secure:1; + unsigned idau_region_valid:1; + unsigned idau_region:8; +#else + unsigned :12; +#endif + } flags; + unsigned value; +} cmse_address_info_t; + +#endif /* __ARM_BIG_ENDIAN */ + +#define cmse_TT_fptr(p) (__cmse_TT_fptr ((__cmse_fptr)(p))) + +typedef void (*__cmse_fptr)(void); + +#define __CMSE_TT_ASM(flags) \ +{ \ + cmse_address_info_t __result; \ + __asm__ ("tt" # flags " %0,%1" \ + : "=r"(__result) \ + : "r"(__p) \ + : "memory"); \ + return __result; \ +} + +__extension__ static __inline __attribute__ ((__always_inline__)) +cmse_address_info_t +__cmse_TT_fptr (__cmse_fptr __p) +__CMSE_TT_ASM () + +__extension__ static __inline __attribute__ ((__always_inline__)) +cmse_address_info_t +cmse_TT (void *__p) +__CMSE_TT_ASM () + +#define cmse_TTT_fptr(p) (__cmse_TTT_fptr ((__cmse_fptr)(p))) + +__extension__ static __inline __attribute__ ((__always_inline__)) +cmse_address_info_t +__cmse_TTT_fptr (__cmse_fptr __p) +__CMSE_TT_ASM (t) + +__extension__ static __inline __attribute__ ((__always_inline__)) +cmse_address_info_t +cmse_TTT (void *__p) +__CMSE_TT_ASM (t) + +#if __ARM_FEATURE_CMSE & 2 + +#define cmse_TTA_fptr(p) (__cmse_TTA_fptr ((__cmse_fptr)(p))) + +__extension__ static __inline __attribute__ ((__always_inline__)) +cmse_address_info_t +__cmse_TTA_fptr (__cmse_fptr __p) +__CMSE_TT_ASM (a) + +__extension__ static __inline __attribute__ ((__always_inline__)) +cmse_address_info_t +cmse_TTA (void *__p) +__CMSE_TT_ASM (a) + +#define cmse_TTAT_fptr(p) (__cmse_TTAT_fptr ((__cmse_fptr)(p))) + +__extension__ static __inline cmse_address_info_t +__attribute__ ((__always_inline__)) +__cmse_TTAT_fptr (__cmse_fptr __p) +__CMSE_TT_ASM (at) + +__extension__ static __inline cmse_address_info_t +__attribute__ ((__always_inline__)) +cmse_TTAT (void *__p) +__CMSE_TT_ASM (at) + +#define CMSE_AU_NONSECURE 2 +#define CMSE_MPU_NONSECURE 16 +#define CMSE_NONSECURE 18 + +#define cmse_nsfptr_create(p) ((typeof ((p))) ((intptr_t) (p) & ~1)) + +#define cmse_is_nsfptr(p) (!((intptr_t) (p) & 1)) + +#endif /* __ARM_FEATURE_CMSE & 2 */ + +#define CMSE_MPU_UNPRIV 4 +#define CMSE_MPU_READWRITE 1 +#define CMSE_MPU_READ 8 + +__extension__ void * +cmse_check_address_range (void *, size_t, int); + +#define cmse_check_pointed_object(p, f) \ + ((typeof ((p))) cmse_check_address_range ((p), sizeof (*(p)), (f))) + +#endif /* __ARM_FEATURE_CMSE & 1 */ + +#ifdef __cplusplus +} +#endif + +#endif /* _GCC_ARM_CMSE_H */ diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index c40e289..6c7fff2 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -11794,6 +11794,7 @@ instructions, but allow the compiler to schedule those calls. * ARM iWMMXt Built-in Functions:: * ARM C Language Extensions (ACLE):: * ARM Floating Point Status and Control Intrinsics:: +* ARM ARMv8-M Security Extensions:: * AVR Built-in Functions:: * Blackfin Built-in Functions:: * FR-V Built-in Functions:: @@ -12639,6 +12640,31 @@ unsigned int __builtin_arm_get_fpscr () void __builtin_arm_set_fpscr (unsigned int) @end smallexample +@node ARM ARMv8-M Security Extensions +@subsection ARM ARMv8-M Security Extensions + +GCC implements the ARMv8-M Security Extensions as described in the ARMv8-M +Security Extensions: Requiremenets on Development Tools Engineering +Specification, which can be found at +@uref{http://infocenter.arm.com/help/topic/com.arm.doc.ecm0359818/ECM0359818_armv8m_security_extensions_reqs_on_dev_tools_1_0.pdf}. + +As part of the Security Extensions GCC implements the intrinsics below. FPTR +is used here to mean any function pointer type. + +@smallexample +cmse_address_info_t cmse_TT (void *) +cmse_address_info_t cmse_TT_fptr (FPTR) +cmse_address_info_t cmse_TTT (void *) +cmse_address_info_t cmse_TTT_fptr (FPTR) +cmse_address_info_t cmse_TTA (void *) +cmse_address_info_t cmse_TTA_fptr (FPTR) +cmse_address_info_t cmse_TTAT (void *) +cmse_address_info_t cmse_TTAT_fptr (FPTR) +void * cmse_check_address_range (void *, size_t, int) +typeof(p) cmse_nsfptr_create (FPTR p) +intptr_t cmse_is_nsfptr (FPTR) +@end smallexample + @node AVR Built-in Functions @subsection AVR Built-in Functions diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index fd549ec..034ae98 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -649,7 +649,8 @@ Objective-C and Objective-C++ Dialects}. -mslow-flash-data @gol -masm-syntax-unified @gol -mrestrict-it @gol --mpure-code} +-mpure-code @gol +-mcmse} @emph{AVR Options} @gccoptlist{-mmcu=@var{mcu} -mabsdata -maccumulate-args @gol @@ -15378,6 +15379,11 @@ Additionally, when compiling for ELF object format give all text sections the ELF processor-specific section attribute @code{SHF_ARM_PURECODE}. This option is only available when generating non-pic code for ARMv7-M targets. +@item -mcmse +@opindex mcmse +Generate secure code as per the "ARMv8-M Security Extensions: Requirements on +Development Tools Engineering Specification", which can be found on +@url{http://infocenter.arm.com/help/topic/com.arm.doc.ecm0359818/ECM0359818_armv8m_security_extensions_reqs_on_dev_tools_1_0.pdf}. @end table @node AVR Options diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index fdda301..e7fdd2d 100644 --- a/gcc/doc/sourcebuild.texi +++ b/gcc/doc/sourcebuild.texi @@ -1674,6 +1674,10 @@ ARM target generates Thumb-1 code for @code{-mthumb} with ARM target for which divmod transform is disabled, if it supports hardware div instruction. +@item arm_cmse_ok +ARM target supports ARMv8-M Security Extensions, enabled by the @code{-mcmse} +option. + @end table @subsubsection AArch64-specific attributes diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index f1b5c35..1e3d651 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,12 @@ +2016-12-02 Andre Vieira + Thomas Preud'homme + + * gcc.target/arm/cmse/cmse.exp: New. + * gcc.target/arm/cmse/cmse-1.c: New. + * gcc.target/arm/cmse/cmse-12.c: New. + * lib/target-supports.exp + (check_effective_target_arm_cmse_ok): New. + 2016-12-02 Cesar Philippidis James Norris diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-1.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-1.c new file mode 100644 index 0000000..d5b9a2d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-1.c @@ -0,0 +1,67 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -mcmse -fdump-rtl-expand" } */ + +#include + +extern int a; +extern int bar (void); + +int foo (char * p) +{ + cmse_address_info_t cait; + + cait = cmse_TT (&a); + if (cait.flags.mpu_region) + a++; + + cait = cmse_TT_fptr (&bar); + if (cait.flags.mpu_region) + a+= bar (); + + cait = cmse_TTA (&a); + if (cait.flags.mpu_region) + a++; + + cait = cmse_TTA_fptr (&bar); + if (cait.flags.mpu_region) + a+= bar (); + + cait = cmse_TTT (&a); + if (cait.flags.mpu_region) + a++; + + cait = cmse_TTT_fptr (&bar); + if (cait.flags.mpu_region) + a+= bar (); + + cait = cmse_TTAT (&a); + if (cait.flags.mpu_region) + a++; + + cait = cmse_TTAT_fptr (&bar); + if (cait.flags.mpu_region) + a+= bar (); + + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), 0); + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), + CMSE_MPU_UNPRIV); + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), + CMSE_MPU_READWRITE); + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), + CMSE_MPU_UNPRIV | CMSE_MPU_READ); + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), + CMSE_AU_NONSECURE + | CMSE_MPU_NONSECURE); + p = (char *) cmse_check_address_range ((void *) p, sizeof (char), + CMSE_NONSECURE | CMSE_MPU_UNPRIV); + + p = (char *) cmse_check_pointed_object (p, CMSE_NONSECURE | CMSE_MPU_UNPRIV); + + return a; +} +/* { dg-final { scan-assembler-times "\ttt " 2 } } */ +/* { dg-final { scan-assembler-times "ttt " 2 } } */ +/* { dg-final { scan-assembler-times "tta " 2 } } */ +/* { dg-final { scan-assembler-times "ttat " 2 } } */ +/* { dg-final { scan-assembler-times "bl.cmse_check_address_range" 7 } } */ +/* { dg-final { scan-assembler-not "cmse_check_pointed_object" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-12.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-12.c new file mode 100644 index 0000000..87a2f13 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-12.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ +#include + +char * +foo (char * p) +{ + if (!cmse_is_nsfptr (p)) + return cmse_nsfptr_create (p); +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler-not "cmse_is_nsfptr" } } */ +/* { dg-final { scan-assembler-not "cmse_nsfptr_create" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse.exp b/gcc/testsuite/gcc.target/arm/cmse/cmse.exp new file mode 100644 index 0000000..f797dba --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse.exp @@ -0,0 +1,50 @@ +# Copyright (C) 1997-2016 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . + +# GCC testsuite for ARMv8-M Security Extensions using the `dg.exp' driver. + +# Load support procs. +load_lib gcc-dg.exp + +# Exit immediately if the target does not support -mcmse. +if ![check_effective_target_arm_cmse_ok] then { + return +} + +# If a testcase doesn't have special options, use these. +global DEFAULT_CFLAGS +if ![info exists DEFAULT_CFLAGS] then { + set DEFAULT_CFLAGS " -ansi -pedantic-errors" +} + +# Initialize `dg'. +dg-init + +set saved-dg-do-what-default ${dg-do-what-default} +set dg-do-what-default "assemble" + +set saved-lto_torture_options ${LTO_TORTURE_OPTIONS} +set LTO_TORTURE_OPTIONS "" + +# These are for both baseline and mainline. +gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] \ + "" $DEFAULT_CFLAGS + +set LTO_TORTURE_OPTIONS ${saved-lto_torture_options} +set dg-do-what-default ${saved-dg-do-what-default} + +# All done. +dg-finish diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 798cf6b..0fc0baf 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -3916,6 +3916,19 @@ proc check_effective_target_arm_thumb1_cbz_ok {} { } } +# Return 1 if this is an ARM target where ARMv8-M Security Extensions is +# available. + +proc check_effective_target_arm_cmse_ok {} { + return [check_no_compiler_messages arm_cmse object { + int + foo (void) + { + asm ("bxns r0"); + } + } "-mcmse"]; +} + # Return 1 if this compilation turns on string_ops_prefer_neon on. proc check_effective_target_arm_tune_string_ops_prefer_neon { } { diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog index 438021a..f323f43 100644 --- a/libgcc/ChangeLog +++ b/libgcc/ChangeLog @@ -1,3 +1,9 @@ +2016-12-02 Andre Vieira + Thomas Preud'homme + + * config/arm/t-arm (HAVE_CMSE): New. + * config/arm/cmse.c: New. + 2016-11-28 Thomas Petazzoni PR gcc/74748 diff --git a/libgcc/config/arm/cmse.c b/libgcc/config/arm/cmse.c new file mode 100644 index 0000000..fe3a229 --- /dev/null +++ b/libgcc/config/arm/cmse.c @@ -0,0 +1,108 @@ +/* ARMv8-M Security Extensions routines. + Copyright (C) 2015-2016 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + + +#if __ARM_FEATURE_CMSE & 1 + +#include + +/* ARM intrinsic function to perform a permission check on a given + address range. See ACLE changes for ARMv8-M. */ + +void * +cmse_check_address_range (void *p, size_t size, int flags) +{ + cmse_address_info_t permb, perme; + char *pb = (char *) p, *pe; + + /* Check if the range wraps around. */ + if (UINTPTR_MAX - (uintptr_t) p < size) + return NULL; + + /* Check if an unknown flag is present. */ + int known = CMSE_MPU_UNPRIV | CMSE_MPU_READWRITE | CMSE_MPU_READ; + int known_secure_level = CMSE_MPU_UNPRIV; +#if __ARM_FEATURE_CMSE & 2 + known |= CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE; + known_secure_level |= CMSE_MPU_NONSECURE; +#endif + if (flags & (~known)) + return NULL; + + /* Execute the right variant of the TT instructions. */ + pe = pb + size - 1; + const int singleCheck = (((uintptr_t) pb ^ (uintptr_t) pe) < 32); + switch (flags & known_secure_level) + { + case 0: + permb = cmse_TT (pb); + perme = singleCheck ? permb : cmse_TT (pe); + break; + case CMSE_MPU_UNPRIV: + permb = cmse_TTT (pb); + perme = singleCheck ? permb : cmse_TTT (pe); + break; +#if __ARM_FEATURE_CMSE & 2 + case CMSE_MPU_NONSECURE: + permb = cmse_TTA (pb); + perme = singleCheck ? permb : cmse_TTA (pe); + break; + case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE: + permb = cmse_TTAT (pb); + perme = singleCheck ? permb : cmse_TTAT (pe); + break; +#endif + default: + /* Invalid flag, eg. CMSE_MPU_NONSECURE specified but + __ARM_FEATURE_CMSE & 2 == 0. */ + return NULL; + } + + /* Check that the range does not cross MPU, SAU, or IDAU boundaries. */ + if (permb.value != perme.value) + return NULL; + + /* Check the permissions on the range. */ + switch (flags & (~known_secure_level)) + { +#if __ARM_FEATURE_CMSE & 2 + case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE: + case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE: + return permb.flags.nonsecure_readwrite_ok ? p : NULL; + case CMSE_MPU_READ | CMSE_AU_NONSECURE: + return permb.flags.nonsecure_read_ok ? p : NULL; + case CMSE_AU_NONSECURE: + return permb.flags.secure ? NULL : p; +#endif + case CMSE_MPU_READ | CMSE_MPU_READWRITE: + case CMSE_MPU_READWRITE: + return permb.flags.readwrite_ok ? p : NULL; + case CMSE_MPU_READ: + return permb.flags.read_ok ? p : NULL; + default: + return NULL; + } +} + + +#endif /* __ARM_FEATURE_CMSE & 1. */ diff --git a/libgcc/config/arm/t-arm b/libgcc/config/arm/t-arm index 4e17e99..5618143 100644 --- a/libgcc/config/arm/t-arm +++ b/libgcc/config/arm/t-arm @@ -1,3 +1,15 @@ LIB1ASMSRC = arm/lib1funcs.S LIB1ASMFUNCS = _thumb1_case_sqi _thumb1_case_uqi _thumb1_case_shi \ _thumb1_case_uhi _thumb1_case_si + +HAVE_CMSE:=$(findstring __ARM_FEATURE_CMSE,$(shell $(gcc_compile_bare) -dM -E - /dev/null),) +CMSE_OPTS:=-mcmse +endif + +ifdef HAVE_CMSE +libgcc-objects += cmse.o cmse_nonsecure_call.o + +cmse.o: $(srcdir)/config/arm/cmse.c + $(gcc_compile) -c $(CMSE_OPTS) $< +endif -- cgit v1.1 From 97b0656d67d2d39a79556bd200d3b6c41b2e5d6f Mon Sep 17 00:00:00 2001 From: Andre Vieira Date: Fri, 2 Dec 2016 15:24:40 +0000 Subject: Handling ARMv8-M Security Extension's cmse_nonsecure_entry attribute gcc/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * config/arm/arm.c (arm_handle_cmse_nonsecure_entry): New. (arm_attribute_table): Added cmse_nonsecure_entry (arm_compute_func_type): Handle cmse_nonsecure_entry. (cmse_func_args_or_return_in_stack): New. (arm_handle_cmse_nonsecure_entry): New. * config/arm/arm.h (ARM_FT_CMSE_ENTRY): New macro define. (IS_CMSE_ENTRY): Likewise. * doc/extend.texi (ARM ARMv8-M Security Extensions): New attribute. gcc/testsuite/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * gcc.target/arm/cmse/cmse-3.c: New. Co-Authored-By: Thomas Preud'homme From-SVN: r243188 --- gcc/ChangeLog | 12 +++ gcc/config/arm/arm.c | 114 +++++++++++++++++++++++++++++ gcc/config/arm/arm.h | 2 + gcc/doc/extend.texi | 3 + gcc/testsuite/ChangeLog | 5 ++ gcc/testsuite/gcc.target/arm/cmse/cmse-3.c | 37 ++++++++++ 6 files changed, 173 insertions(+) create mode 100644 gcc/testsuite/gcc.target/arm/cmse/cmse-3.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f24e99e..7eb56d2 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,6 +1,18 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * config/arm/arm.c (arm_handle_cmse_nonsecure_entry): New. + (arm_attribute_table): Added cmse_nonsecure_entry + (arm_compute_func_type): Handle cmse_nonsecure_entry. + (cmse_func_args_or_return_in_stack): New. + (arm_handle_cmse_nonsecure_entry): New. + * config/arm/arm.h (ARM_FT_CMSE_ENTRY): New macro define. + (IS_CMSE_ENTRY): Likewise. + * doc/extend.texi (ARM ARMv8-M Security Extensions): New attribute. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * config.gcc (extra_headers): Added arm_cmse.h. * config/arm/arm-arches.def (ARM_ARCH): (armv8-m): Add FL2_CMSE. diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 0d7d38a..7761564 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -136,6 +136,7 @@ static tree arm_handle_isr_attribute (tree *, tree, tree, int, bool *); #if TARGET_DLLIMPORT_DECL_ATTRIBUTES static tree arm_handle_notshared_attribute (tree *, tree, tree, int, bool *); #endif +static tree arm_handle_cmse_nonsecure_entry (tree *, tree, tree, int, bool *); static void arm_output_function_epilogue (FILE *, HOST_WIDE_INT); static void arm_output_function_prologue (FILE *, HOST_WIDE_INT); static int arm_comp_type_attributes (const_tree, const_tree); @@ -344,6 +345,9 @@ static const struct attribute_spec arm_attribute_table[] = { "notshared", 0, 0, false, true, false, arm_handle_notshared_attribute, false }, #endif + /* ARMv8-M Security Extensions support. */ + { "cmse_nonsecure_entry", 0, 0, true, false, false, + arm_handle_cmse_nonsecure_entry, false }, { NULL, 0, 0, false, false, false, NULL, false } }; @@ -3633,6 +3637,9 @@ arm_compute_func_type (void) else type |= arm_isr_value (TREE_VALUE (a)); + if (lookup_attribute ("cmse_nonsecure_entry", attr)) + type |= ARM_FT_CMSE_ENTRY; + return type; } @@ -6634,6 +6641,113 @@ arm_handle_notshared_attribute (tree *node, } #endif +/* This function returns true if a function with declaration FNDECL and type + FNTYPE uses the stack to pass arguments or return variables and false + otherwise. This is used for functions with the attributes + 'cmse_nonsecure_call' or 'cmse_nonsecure_entry' and this function will issue + diagnostic messages if the stack is used. NAME is the name of the attribute + used. */ + +static bool +cmse_func_args_or_return_in_stack (tree fndecl, tree name, tree fntype) +{ + function_args_iterator args_iter; + CUMULATIVE_ARGS args_so_far_v; + cumulative_args_t args_so_far; + bool first_param = true; + tree arg_type, prev_arg_type = NULL_TREE, ret_type; + + /* Error out if any argument is passed on the stack. */ + arm_init_cumulative_args (&args_so_far_v, fntype, NULL_RTX, fndecl); + args_so_far = pack_cumulative_args (&args_so_far_v); + FOREACH_FUNCTION_ARGS (fntype, arg_type, args_iter) + { + rtx arg_rtx; + machine_mode arg_mode = TYPE_MODE (arg_type); + + prev_arg_type = arg_type; + if (VOID_TYPE_P (arg_type)) + continue; + + if (!first_param) + arm_function_arg_advance (args_so_far, arg_mode, arg_type, true); + arg_rtx = arm_function_arg (args_so_far, arg_mode, arg_type, true); + if (!arg_rtx + || arm_arg_partial_bytes (args_so_far, arg_mode, arg_type, true)) + { + error ("%qE attribute not available to functions with arguments " + "passed on the stack", name); + return true; + } + first_param = false; + } + + /* Error out for variadic functions since we cannot control how many + arguments will be passed and thus stack could be used. stdarg_p () is not + used for the checking to avoid browsing arguments twice. */ + if (prev_arg_type != NULL_TREE && !VOID_TYPE_P (prev_arg_type)) + { + error ("%qE attribute not available to functions with variable number " + "of arguments", name); + return true; + } + + /* Error out if return value is passed on the stack. */ + ret_type = TREE_TYPE (fntype); + if (arm_return_in_memory (ret_type, fntype)) + { + error ("%qE attribute not available to functions that return value on " + "the stack", name); + return true; + } + return false; +} + +/* Called upon detection of the use of the cmse_nonsecure_entry attribute, this + function will check whether the attribute is allowed here and will add the + attribute to the function declaration tree or otherwise issue a warning. */ + +static tree +arm_handle_cmse_nonsecure_entry (tree *node, tree name, + tree /* args */, + int /* flags */, + bool *no_add_attrs) +{ + tree fndecl; + + if (!use_cmse) + { + *no_add_attrs = true; + warning (OPT_Wattributes, "%qE attribute ignored without -mcmse option.", + name); + return NULL_TREE; + } + + /* Ignore attribute for function types. */ + if (TREE_CODE (*node) != FUNCTION_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + fndecl = *node; + + /* Warn for static linkage functions. */ + if (!TREE_PUBLIC (fndecl)) + { + warning (OPT_Wattributes, "%qE attribute has no effect on functions " + "with static linkage", name); + *no_add_attrs = true; + return NULL_TREE; + } + + *no_add_attrs |= cmse_func_args_or_return_in_stack (fndecl, name, + TREE_TYPE (fndecl)); + return NULL_TREE; +} + /* Return 0 if the attributes for two types are incompatible, 1 if they are compatible, and 2 if they are nearly compatible (which causes a warning to be generated). */ diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 3d62743..928fad4 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -1385,6 +1385,7 @@ enum reg_class #define ARM_FT_VOLATILE (1 << 4) /* Does not return. */ #define ARM_FT_NESTED (1 << 5) /* Embedded inside another func. */ #define ARM_FT_STACKALIGN (1 << 6) /* Called with misaligned stack. */ +#define ARM_FT_CMSE_ENTRY (1 << 7) /* ARMv8-M non-secure entry function. */ /* Some macros to test these flags. */ #define ARM_FUNC_TYPE(t) (t & ARM_FT_TYPE_MASK) @@ -1393,6 +1394,7 @@ enum reg_class #define IS_NAKED(t) (t & ARM_FT_NAKED) #define IS_NESTED(t) (t & ARM_FT_NESTED) #define IS_STACKALIGN(t) (t & ARM_FT_STACKALIGN) +#define IS_CMSE_ENTRY(t) (t & ARM_FT_CMSE_ENTRY) /* Structure used to hold the function stack frame layout. Offsets are diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 6c7fff2..d72af3e 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -12648,6 +12648,9 @@ Security Extensions: Requiremenets on Development Tools Engineering Specification, which can be found at @uref{http://infocenter.arm.com/help/topic/com.arm.doc.ecm0359818/ECM0359818_armv8m_security_extensions_reqs_on_dev_tools_1_0.pdf}. +As part of the Security Extensions GCC implements a new function attribute +@code{cmse_nonsecure_entry}. + As part of the Security Extensions GCC implements the intrinsics below. FPTR is used here to mean any function pointer type. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 1e3d651..4e1240e 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,6 +1,11 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * gcc.target/arm/cmse/cmse-3.c: New. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * gcc.target/arm/cmse/cmse.exp: New. * gcc.target/arm/cmse/cmse-1.c: New. * gcc.target/arm/cmse/cmse-12.c: New. diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-3.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-3.c new file mode 100644 index 0000000..2c2920e --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-3.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +struct span { + int a, b; +}; +struct span2 { + float a, b, c, d; +}; + +union test_union +{ + long long a; + int b; + struct span2 c; +} test_union; + +void __attribute__ ((cmse_nonsecure_entry)) +foo (long long a, int b, long long c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */ + +void __attribute__ ((cmse_nonsecure_entry)) +bar (long long a, int b, struct span c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */ + +void __attribute__ ((cmse_nonsecure_entry)) +baz (int a, ...) {} /* { dg-error "not available to functions with variable number of arguments" } */ + +struct span __attribute__ ((cmse_nonsecure_entry)) +qux (void) { /* { dg-error "not available to functions that return value on the stack" } */ + struct span ret = {0, 0}; + return ret; +} + +void __attribute__ ((cmse_nonsecure_entry)) +norf (struct span2 a) {} + +void __attribute__ ((cmse_nonsecure_entry)) +foo2 (long long a, int b, union test_union c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */ -- cgit v1.1 From 9ad1f699b81ce32d1193301ee2c0c188abf64d28 Mon Sep 17 00:00:00 2001 From: Andre Vieira Date: Fri, 2 Dec 2016 15:27:03 +0000 Subject: ARMv8-M Security Extension's cmse_nonsecure_entry: __acle_se label and bxns return gcc/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * config/arm/arm.c (use_return_insn): Change to return with bxns when cmse_nonsecure_entry. (output_return_instruction): Likewise. (arm_output_function_prologue): Likewise. (thumb_pop): Likewise. (thumb_exit): Likewise. (thumb2_expand_return): Assert that entry functions always have simple returns. (arm_expand_epilogue): Handle entry functions. (arm_function_ok_for_sibcall): Disable sibcall for entry functions. (arm_asm_declare_function_name): New. * config/arm/arm-protos.h (arm_asm_declare_function_name): New. * config/arm/elf.h (ASM_DECLARE_FUNCTION_NAME): Redefine to use arm_asm_declare_function_name. gcc/testsuite/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * gcc.target/arm/cmse/cmse-4.c: New. * gcc.target/arm/cmse/cmse-9.c: New. * gcc.target/arm/cmse/cmse-10.c: New. Co-Authored-By: Thomas Preud'homme From-SVN: r243189 --- gcc/ChangeLog | 18 +++++++ gcc/config/arm/arm-protos.h | 1 + gcc/config/arm/arm.c | 83 +++++++++++++++++++++++++---- gcc/config/arm/elf.h | 11 +--- gcc/testsuite/ChangeLog | 7 +++ gcc/testsuite/gcc.target/arm/cmse/cmse-10.c | 9 ++++ gcc/testsuite/gcc.target/arm/cmse/cmse-4.c | 27 ++++++++++ gcc/testsuite/gcc.target/arm/cmse/cmse-9.c | 12 +++++ 8 files changed, 148 insertions(+), 20 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/cmse/cmse-10.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/cmse-4.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/cmse-9.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7eb56d2..eeb8c3c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,6 +1,24 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * config/arm/arm.c (use_return_insn): Change to return with bxns + when cmse_nonsecure_entry. + (output_return_instruction): Likewise. + (arm_output_function_prologue): Likewise. + (thumb_pop): Likewise. + (thumb_exit): Likewise. + (thumb2_expand_return): Assert that entry functions always have simple + returns. + (arm_expand_epilogue): Handle entry functions. + (arm_function_ok_for_sibcall): Disable sibcall for entry functions. + (arm_asm_declare_function_name): New. + * config/arm/arm-protos.h (arm_asm_declare_function_name): New. + * config/arm/elf.h (ASM_DECLARE_FUNCTION_NAME): Redefine to + use arm_asm_declare_function_name. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * config/arm/arm.c (arm_handle_cmse_nonsecure_entry): New. (arm_attribute_table): Added cmse_nonsecure_entry (arm_compute_func_type): Handle cmse_nonsecure_entry. diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 68e9bea..634a5de 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -33,6 +33,7 @@ extern int arm_volatile_func (void); extern void arm_expand_prologue (void); extern void arm_expand_epilogue (bool); extern void arm_declare_function_name (FILE *, const char *, tree); +extern void arm_asm_declare_function_name (FILE *, const char *, tree); extern void thumb2_expand_return (bool); extern const char *arm_strip_name_encoding (const char *); extern void arm_asm_output_labelref (FILE *, const char *); diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 7761564..db7e0c8 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -3866,6 +3866,11 @@ use_return_insn (int iscond, rtx sibling) return 0; } + /* ARMv8-M nonsecure entry function need to use bxns to return and thus need + several instructions if anything needs to be popped. */ + if (saved_int_regs && IS_CMSE_ENTRY (func_type)) + return 0; + /* If there are saved registers but the LR isn't saved, then we need two instructions for the return. */ if (saved_int_regs && !(saved_int_regs & (1 << LR_REGNUM))) @@ -6906,6 +6911,11 @@ arm_function_ok_for_sibcall (tree decl, tree exp) if (IS_INTERRUPT (func_type)) return false; + /* ARMv8-M non-secure entry functions need to return with bxns which is only + generated for entry functions themselves. */ + if (IS_CMSE_ENTRY (arm_current_func_type ())) + return false; + if (!VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) { /* Check that the return value locations are the same. For @@ -18568,6 +18578,7 @@ output_return_instruction (rtx operand, bool really_return, bool reverse, (e.g. interworking) then we can load the return address directly into the PC. Otherwise we must load it into LR. */ if (really_return + && !IS_CMSE_ENTRY (func_type) && (IS_INTERRUPT (func_type) || !TARGET_INTERWORK)) return_reg = reg_names[PC_REGNUM]; else @@ -18708,8 +18719,10 @@ output_return_instruction (rtx operand, bool really_return, bool reverse, break; default: + if (IS_CMSE_ENTRY (func_type)) + snprintf (instr, sizeof (instr), "bxns%s\t%%|lr", conditional); /* Use bx if it's available. */ - if (arm_arch5 || arm_arch4t) + else if (arm_arch5 || arm_arch4t) sprintf (instr, "bx%s\t%%|lr", conditional); else sprintf (instr, "mov%s\t%%|pc, %%|lr", conditional); @@ -18722,6 +18735,44 @@ output_return_instruction (rtx operand, bool really_return, bool reverse, return ""; } +/* Output in FILE asm statements needed to declare the NAME of the function + defined by its DECL node. */ + +void +arm_asm_declare_function_name (FILE *file, const char *name, tree decl) +{ + size_t cmse_name_len; + char *cmse_name = 0; + char cmse_prefix[] = "__acle_se_"; + + /* When compiling with ARMv8-M Security Extensions enabled, we should print an + extra function label for each function with the 'cmse_nonsecure_entry' + attribute. This extra function label should be prepended with + '__acle_se_', telling the linker that it needs to create secure gateway + veneers for this function. */ + if (use_cmse && lookup_attribute ("cmse_nonsecure_entry", + DECL_ATTRIBUTES (decl))) + { + cmse_name_len = sizeof (cmse_prefix) + strlen (name); + cmse_name = XALLOCAVEC (char, cmse_name_len); + snprintf (cmse_name, cmse_name_len, "%s%s", cmse_prefix, name); + targetm.asm_out.globalize_label (file, cmse_name); + + ARM_DECLARE_FUNCTION_NAME (file, cmse_name, decl); + ASM_OUTPUT_TYPE_DIRECTIVE (file, cmse_name, "function"); + } + + ARM_DECLARE_FUNCTION_NAME (file, name, decl); + ASM_OUTPUT_TYPE_DIRECTIVE (file, name, "function"); + ASM_DECLARE_RESULT (file, DECL_RESULT (decl)); + ASM_OUTPUT_LABEL (file, name); + + if (cmse_name) + ASM_OUTPUT_LABEL (file, cmse_name); + + ARM_OUTPUT_FN_UNWIND (file, TRUE); +} + /* Write the function name into the code section, directly preceding the function prologue. @@ -18771,10 +18822,6 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size) { unsigned long func_type; - /* ??? Do we want to print some of the below anyway? */ - if (TARGET_THUMB1) - return; - /* Sanity check. */ gcc_assert (!arm_ccfsm_state && !arm_target_insn); @@ -18809,6 +18856,8 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size) asm_fprintf (f, "\t%@ Nested: function declared inside another function.\n"); if (IS_STACKALIGN (func_type)) asm_fprintf (f, "\t%@ Stack Align: May be called with mis-aligned SP.\n"); + if (IS_CMSE_ENTRY (func_type)) + asm_fprintf (f, "\t%@ Non-secure entry function: called from non-secure code.\n"); asm_fprintf (f, "\t%@ args = %d, pretend = %d, frame = %wd\n", crtl->args.size, @@ -22915,8 +22964,8 @@ thumb_pop (FILE *f, unsigned long mask) if (mask & (1 << PC_REGNUM)) { /* Catch popping the PC. */ - if (TARGET_INTERWORK || TARGET_BACKTRACE - || crtl->calls_eh_return) + if (TARGET_INTERWORK || TARGET_BACKTRACE || crtl->calls_eh_return + || IS_CMSE_ENTRY (arm_current_func_type ())) { /* The PC is never poped directly, instead it is popped into r3 and then BX is used. */ @@ -22977,7 +23026,10 @@ thumb_exit (FILE *f, int reg_containing_return_addr) if (crtl->calls_eh_return) asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM); - asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); + if (IS_CMSE_ENTRY (arm_current_func_type ())) + asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr); + else + asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); return; } /* Otherwise if we are not supporting interworking and we have not created @@ -22986,7 +23038,8 @@ thumb_exit (FILE *f, int reg_containing_return_addr) else if (!TARGET_INTERWORK && !TARGET_BACKTRACE && !is_called_in_ARM_mode (current_function_decl) - && !crtl->calls_eh_return) + && !crtl->calls_eh_return + && !IS_CMSE_ENTRY (arm_current_func_type ())) { asm_fprintf (f, "\tpop\t{%r}\n", PC_REGNUM); return; @@ -23209,7 +23262,10 @@ thumb_exit (FILE *f, int reg_containing_return_addr) asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM); /* Return to caller. */ - asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); + if (IS_CMSE_ENTRY (arm_current_func_type ())) + asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr); + else + asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); } /* Scan INSN just before assembler is output for it. @@ -24095,6 +24151,12 @@ thumb2_expand_return (bool simple_return) if (!simple_return && saved_regs_mask) { + /* TODO: Verify that this path is never taken for cmse_nonsecure_entry + functions or adapt code to handle according to ACLE. This path should + not be reachable for cmse_nonsecure_entry functions though we prefer + to assert it for now to ensure that future code changes do not silently + change this behavior. */ + gcc_assert (!IS_CMSE_ENTRY (arm_current_func_type ())); if (num_regs == 1) { rtx par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2)); @@ -24512,6 +24574,7 @@ arm_expand_epilogue (bool really_return) if (ARM_FUNC_TYPE (func_type) != ARM_FT_INTERWORKED && (TARGET_ARM || ARM_FUNC_TYPE (func_type) == ARM_FT_NORMAL) + && !IS_CMSE_ENTRY (func_type) && !IS_STACKALIGN (func_type) && really_return && crtl->args.pretend_args_size == 0 diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h index bc4eb86..03931ee 100644 --- a/gcc/config/arm/elf.h +++ b/gcc/config/arm/elf.h @@ -75,16 +75,7 @@ /* We might need a ARM specific header to function declarations. */ #undef ASM_DECLARE_FUNCTION_NAME -#define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL) \ - do \ - { \ - ARM_DECLARE_FUNCTION_NAME (FILE, NAME, DECL); \ - ASM_OUTPUT_TYPE_DIRECTIVE (FILE, NAME, "function"); \ - ASM_DECLARE_RESULT (FILE, DECL_RESULT (DECL)); \ - ASM_OUTPUT_LABEL(FILE, NAME); \ - ARM_OUTPUT_FN_UNWIND (FILE, TRUE); \ - } \ - while (0) +#define ASM_DECLARE_FUNCTION_NAME arm_asm_declare_function_name /* We might need an ARM specific trailer for function declarations. */ #undef ASM_DECLARE_FUNCTION_SIZE diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 4e1240e..e31a1b3 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,6 +1,13 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * gcc.target/arm/cmse/cmse-4.c: New. + * gcc.target/arm/cmse/cmse-9.c: New. + * gcc.target/arm/cmse/cmse-10.c: New. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * gcc.target/arm/cmse/cmse-3.c: New. 2016-12-02 Andre Vieira diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-10.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-10.c new file mode 100644 index 0000000..1a91ac3 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-10.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +void +foo (void) {} + +/* { dg-final { scan-assembler-not "bxns" } } */ +/* { dg-final { scan-assembler "foo:" } } */ +/* { dg-final { scan-assembler-not "__acle_se_foo:" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-4.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-4.c new file mode 100644 index 0000000..6f930ab --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-4.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +struct span { + int a, b; +}; + +extern int qux (void); + +void __attribute__ ((cmse_nonsecure_entry)) +foo (void) {} + +static void __attribute__ ((cmse_nonsecure_entry)) +bar (void) {} /* { dg-warning "has no effect on functions with static linkage" } */ + +int __attribute__ ((cmse_nonsecure_entry)) +baz (void) +{ + return qux (); +} + +/* { dg-final { scan-assembler-times "bxns" 2 } } */ +/* { dg-final { scan-assembler "foo:" } } */ +/* { dg-final { scan-assembler "__acle_se_foo:" } } */ +/* { dg-final { scan-assembler-not "__acle_se_bar:" } } */ +/* { dg-final { scan-assembler "baz:" } } */ +/* { dg-final { scan-assembler "__acle_se_baz:" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-9.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-9.c new file mode 100644 index 0000000..1d97f0e --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-9.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-skip-if "Testing exclusion of -mcmse" { arm-*-* } { "-mcmse" } { "" } } */ + + +int __attribute__ ((cmse_nonsecure_entry)) +foo (int a) +{ /* { dg-warning "attribute ignored without -mcmse option" } */ + return a + 1; +} + +/* { dg-final { scan-assembler "foo:" } } */ +/* { dg-final { scan-assembler-not "__acle_se_foo:" } } */ -- cgit v1.1 From de954d6a5fb7dbb2c4c0a646a5e59727b06847c1 Mon Sep 17 00:00:00 2001 From: Andre Vieira Date: Fri, 2 Dec 2016 15:29:03 +0000 Subject: ARMv8-M Security Extension's cmse_nonsecure_entry: clear registers gcc/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * config/arm/arm.c (output_return_instruction): Clear registers. (thumb2_expand_return): Likewise. (thumb1_expand_epilogue): Likewise. (thumb_exit): Likewise. (arm_expand_epilogue): Likewise. (cmse_nonsecure_entry_clear_before_return): New. (comp_not_to_clear_mask_str_un): New. (compute_not_to_clear_mask): New. * config/arm/thumb1.md (*epilogue_insns): Change length attribute. * config/arm/thumb2.md (*thumb2_return): Disable for cmse_nonsecure_entry functions. (*thumb2_cmse_entry_return): Duplicate thumb2_return pattern for cmse_nonsecure_entry functions. gcc/testsuite/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * gcc.target/arm/cmse/cmse.exp: Test different multilibs separate. * gcc.target/arm/cmse/struct-1.c: New. * gcc.target/arm/cmse/bitfield-1.c: New. * gcc.target/arm/cmse/bitfield-2.c: New. * gcc.target/arm/cmse/bitfield-3.c: New. * gcc.target/arm/cmse/baseline/cmse-2.c: New. * gcc.target/arm/cmse/baseline/softfp.c: New. * gcc.target/arm/cmse/mainline/soft/cmse-5.c: New. * gcc.target/arm/cmse/mainline/hard/cmse-5.c: New. * gcc.target/arm/cmse/mainline/hard-sp/cmse-5.c: New. * gcc.target/arm/cmse/mainline/softfp/cmse-5.c: New. * gcc.target/arm/cmse/mainline/softfp-sp/cmse-5.c: New. Co-Authored-By: Thomas Preud'homme From-SVN: r243190 --- gcc/ChangeLog | 18 + gcc/config/arm/arm.c | 486 ++++++++++++++++++++- gcc/config/arm/thumb1.md | 9 +- gcc/config/arm/thumb2.md | 21 +- gcc/testsuite/ChangeLog | 16 + .../gcc.target/arm/cmse/baseline/cmse-2.c | 19 + .../gcc.target/arm/cmse/baseline/softfp.c | 29 ++ gcc/testsuite/gcc.target/arm/cmse/bitfield-1.c | 39 ++ gcc/testsuite/gcc.target/arm/cmse/bitfield-2.c | 36 ++ gcc/testsuite/gcc.target/arm/cmse/bitfield-3.c | 37 ++ gcc/testsuite/gcc.target/arm/cmse/cmse.exp | 20 + .../gcc.target/arm/cmse/mainline/hard-sp/cmse-5.c | 45 ++ .../gcc.target/arm/cmse/mainline/hard/cmse-5.c | 38 ++ .../gcc.target/arm/cmse/mainline/soft/cmse-5.c | 24 + .../arm/cmse/mainline/softfp-sp/cmse-5.c | 46 ++ .../gcc.target/arm/cmse/mainline/softfp/cmse-5.c | 38 ++ gcc/testsuite/gcc.target/arm/cmse/struct-1.c | 33 ++ 17 files changed, 948 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-2.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/softfp.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/bitfield-1.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/bitfield-2.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/bitfield-3.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-5.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-5.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-5.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-5.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-5.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/struct-1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index eeb8c3c..bb1e524 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,6 +1,24 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * config/arm/arm.c (output_return_instruction): Clear + registers. + (thumb2_expand_return): Likewise. + (thumb1_expand_epilogue): Likewise. + (thumb_exit): Likewise. + (arm_expand_epilogue): Likewise. + (cmse_nonsecure_entry_clear_before_return): New. + (comp_not_to_clear_mask_str_un): New. + (compute_not_to_clear_mask): New. + * config/arm/thumb1.md (*epilogue_insns): Change length attribute. + * config/arm/thumb2.md (*thumb2_return): Disable for + cmse_nonsecure_entry functions. + (*thumb2_cmse_entry_return): Duplicate thumb2_return pattern for + cmse_nonsecure_entry functions. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * config/arm/arm.c (use_return_insn): Change to return with bxns when cmse_nonsecure_entry. (output_return_instruction): Likewise. diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index db7e0c8..6a9db85 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -16297,6 +16297,279 @@ note_invalid_constants (rtx_insn *insn, HOST_WIDE_INT address, int do_pushes) return; } +/* This function computes the clear mask and PADDING_BITS_TO_CLEAR for structs + and unions in the context of ARMv8-M Security Extensions. It is used as a + helper function for both 'cmse_nonsecure_call' and 'cmse_nonsecure_entry' + functions. The PADDING_BITS_TO_CLEAR pointer can be the base to either one + or four masks, depending on whether it is being computed for a + 'cmse_nonsecure_entry' return value or a 'cmse_nonsecure_call' argument + respectively. The tree for the type of the argument or a field within an + argument is passed in ARG_TYPE, the current register this argument or field + starts in is kept in the pointer REGNO and updated accordingly, the bit this + argument or field starts at is passed in STARTING_BIT and the last used bit + is kept in LAST_USED_BIT which is also updated accordingly. */ + +static unsigned HOST_WIDE_INT +comp_not_to_clear_mask_str_un (tree arg_type, int * regno, + uint32_t * padding_bits_to_clear, + unsigned starting_bit, int * last_used_bit) + +{ + unsigned HOST_WIDE_INT not_to_clear_reg_mask = 0; + + if (TREE_CODE (arg_type) == RECORD_TYPE) + { + unsigned current_bit = starting_bit; + tree field; + long int offset, size; + + + field = TYPE_FIELDS (arg_type); + while (field) + { + /* The offset within a structure is always an offset from + the start of that structure. Make sure we take that into the + calculation of the register based offset that we use here. */ + offset = starting_bit; + offset += TREE_INT_CST_ELT (DECL_FIELD_BIT_OFFSET (field), 0); + offset %= 32; + + /* This is the actual size of the field, for bitfields this is the + bitfield width and not the container size. */ + size = TREE_INT_CST_ELT (DECL_SIZE (field), 0); + + if (*last_used_bit != offset) + { + if (offset < *last_used_bit) + { + /* This field's offset is before the 'last_used_bit', that + means this field goes on the next register. So we need to + pad the rest of the current register and increase the + register number. */ + uint32_t mask; + mask = ((uint32_t)-1) - ((uint32_t) 1 << *last_used_bit); + mask++; + + padding_bits_to_clear[*regno] |= mask; + not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno; + (*regno)++; + } + else + { + /* Otherwise we pad the bits between the last field's end and + the start of the new field. */ + uint32_t mask; + + mask = ((uint32_t)-1) >> (32 - offset); + mask -= ((uint32_t) 1 << *last_used_bit) - 1; + padding_bits_to_clear[*regno] |= mask; + } + current_bit = offset; + } + + /* Calculate further padding bits for inner structs/unions too. */ + if (RECORD_OR_UNION_TYPE_P (TREE_TYPE (field))) + { + *last_used_bit = current_bit; + not_to_clear_reg_mask + |= comp_not_to_clear_mask_str_un (TREE_TYPE (field), regno, + padding_bits_to_clear, offset, + last_used_bit); + } + else + { + /* Update 'current_bit' with this field's size. If the + 'current_bit' lies in a subsequent register, update 'regno' and + reset 'current_bit' to point to the current bit in that new + register. */ + current_bit += size; + while (current_bit >= 32) + { + current_bit-=32; + not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno; + (*regno)++; + } + *last_used_bit = current_bit; + } + + field = TREE_CHAIN (field); + } + not_to_clear_reg_mask |= HOST_WIDE_INT_1U << *regno; + } + else if (TREE_CODE (arg_type) == UNION_TYPE) + { + tree field, field_t; + int i, regno_t, field_size; + int max_reg = -1; + int max_bit = -1; + uint32_t mask; + uint32_t padding_bits_to_clear_res[NUM_ARG_REGS] + = {-1, -1, -1, -1}; + + /* To compute the padding bits in a union we only consider bits as + padding bits if they are always either a padding bit or fall outside a + fields size for all fields in the union. */ + field = TYPE_FIELDS (arg_type); + while (field) + { + uint32_t padding_bits_to_clear_t[NUM_ARG_REGS] + = {0U, 0U, 0U, 0U}; + int last_used_bit_t = *last_used_bit; + regno_t = *regno; + field_t = TREE_TYPE (field); + + /* If the field's type is either a record or a union make sure to + compute their padding bits too. */ + if (RECORD_OR_UNION_TYPE_P (field_t)) + not_to_clear_reg_mask + |= comp_not_to_clear_mask_str_un (field_t, ®no_t, + &padding_bits_to_clear_t[0], + starting_bit, &last_used_bit_t); + else + { + field_size = TREE_INT_CST_ELT (DECL_SIZE (field), 0); + regno_t = (field_size / 32) + *regno; + last_used_bit_t = (starting_bit + field_size) % 32; + } + + for (i = *regno; i < regno_t; i++) + { + /* For all but the last register used by this field only keep the + padding bits that were padding bits in this field. */ + padding_bits_to_clear_res[i] &= padding_bits_to_clear_t[i]; + } + + /* For the last register, keep all padding bits that were padding + bits in this field and any padding bits that are still valid + as padding bits but fall outside of this field's size. */ + mask = (((uint32_t) -1) - ((uint32_t) 1 << last_used_bit_t)) + 1; + padding_bits_to_clear_res[regno_t] + &= padding_bits_to_clear_t[regno_t] | mask; + + /* Update the maximum size of the fields in terms of registers used + ('max_reg') and the 'last_used_bit' in said register. */ + if (max_reg < regno_t) + { + max_reg = regno_t; + max_bit = last_used_bit_t; + } + else if (max_reg == regno_t && max_bit < last_used_bit_t) + max_bit = last_used_bit_t; + + field = TREE_CHAIN (field); + } + + /* Update the current padding_bits_to_clear using the intersection of the + padding bits of all the fields. */ + for (i=*regno; i < max_reg; i++) + padding_bits_to_clear[i] |= padding_bits_to_clear_res[i]; + + /* Do not keep trailing padding bits, we do not know yet whether this + is the end of the argument. */ + mask = ((uint32_t) 1 << max_bit) - 1; + padding_bits_to_clear[max_reg] + |= padding_bits_to_clear_res[max_reg] & mask; + + *regno = max_reg; + *last_used_bit = max_bit; + } + else + /* This function should only be used for structs and unions. */ + gcc_unreachable (); + + return not_to_clear_reg_mask; +} + +/* In the context of ARMv8-M Security Extensions, this function is used for both + 'cmse_nonsecure_call' and 'cmse_nonsecure_entry' functions to compute what + registers are used when returning or passing arguments, which is then + returned as a mask. It will also compute a mask to indicate padding/unused + bits for each of these registers, and passes this through the + PADDING_BITS_TO_CLEAR pointer. The tree of the argument type is passed in + ARG_TYPE, the rtl representation of the argument is passed in ARG_RTX and + the starting register used to pass this argument or return value is passed + in REGNO. It makes use of 'comp_not_to_clear_mask_str_un' to compute these + for struct and union types. */ + +static unsigned HOST_WIDE_INT +compute_not_to_clear_mask (tree arg_type, rtx arg_rtx, int regno, + uint32_t * padding_bits_to_clear) + +{ + int last_used_bit = 0; + unsigned HOST_WIDE_INT not_to_clear_mask; + + if (RECORD_OR_UNION_TYPE_P (arg_type)) + { + not_to_clear_mask + = comp_not_to_clear_mask_str_un (arg_type, ®no, + padding_bits_to_clear, 0, + &last_used_bit); + + + /* If the 'last_used_bit' is not zero, that means we are still using a + part of the last 'regno'. In such cases we must clear the trailing + bits. Otherwise we are not using regno and we should mark it as to + clear. */ + if (last_used_bit != 0) + padding_bits_to_clear[regno] + |= ((uint32_t)-1) - ((uint32_t) 1 << last_used_bit) + 1; + else + not_to_clear_mask &= ~(HOST_WIDE_INT_1U << regno); + } + else + { + not_to_clear_mask = 0; + /* We are not dealing with structs nor unions. So these arguments may be + passed in floating point registers too. In some cases a BLKmode is + used when returning or passing arguments in multiple VFP registers. */ + if (GET_MODE (arg_rtx) == BLKmode) + { + int i, arg_regs; + rtx reg; + + /* This should really only occur when dealing with the hard-float + ABI. */ + gcc_assert (TARGET_HARD_FLOAT_ABI); + + for (i = 0; i < XVECLEN (arg_rtx, 0); i++) + { + reg = XEXP (XVECEXP (arg_rtx, 0, i), 0); + gcc_assert (REG_P (reg)); + + not_to_clear_mask |= HOST_WIDE_INT_1U << REGNO (reg); + + /* If we are dealing with DF mode, make sure we don't + clear either of the registers it addresses. */ + arg_regs = ARM_NUM_REGS (GET_MODE (reg)); + if (arg_regs > 1) + { + unsigned HOST_WIDE_INT mask; + mask = HOST_WIDE_INT_1U << (REGNO (reg) + arg_regs); + mask -= HOST_WIDE_INT_1U << REGNO (reg); + not_to_clear_mask |= mask; + } + } + } + else + { + /* Otherwise we can rely on the MODE to determine how many registers + are being used by this argument. */ + int arg_regs = ARM_NUM_REGS (GET_MODE (arg_rtx)); + not_to_clear_mask |= HOST_WIDE_INT_1U << REGNO (arg_rtx); + if (arg_regs > 1) + { + unsigned HOST_WIDE_INT + mask = HOST_WIDE_INT_1U << (REGNO (arg_rtx) + arg_regs); + mask -= HOST_WIDE_INT_1U << REGNO (arg_rtx); + not_to_clear_mask |= mask; + } + } + } + + return not_to_clear_mask; +} + /* Rewrite move insn into subtract of 0 if the condition codes will be useful in next conditional jump insn. */ @@ -18720,7 +18993,42 @@ output_return_instruction (rtx operand, bool really_return, bool reverse, default: if (IS_CMSE_ENTRY (func_type)) - snprintf (instr, sizeof (instr), "bxns%s\t%%|lr", conditional); + { + /* Check if we have to clear the 'GE bits' which is only used if + parallel add and subtraction instructions are available. */ + if (TARGET_INT_SIMD) + snprintf (instr, sizeof (instr), + "msr%s\tAPSR_nzcvqg, %%|lr", conditional); + else + snprintf (instr, sizeof (instr), + "msr%s\tAPSR_nzcvq, %%|lr", conditional); + + output_asm_insn (instr, & operand); + if (TARGET_HARD_FLOAT && !TARGET_THUMB1) + { + /* Clear the cumulative exception-status bits (0-4,7) and the + condition code bits (28-31) of the FPSCR. We need to + remember to clear the first scratch register used (IP) and + save and restore the second (r4). */ + snprintf (instr, sizeof (instr), "push\t{%%|r4}"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "vmrs\t%%|ip, fpscr"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "movw\t%%|r4, #65376"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "movt\t%%|r4, #4095"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "and\t%%|ip, %%|r4"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "vmsr\tfpscr, %%|ip"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "pop\t{%%|r4}"); + output_asm_insn (instr, & operand); + snprintf (instr, sizeof (instr), "mov\t%%|ip, %%|lr"); + output_asm_insn (instr, & operand); + } + snprintf (instr, sizeof (instr), "bxns\t%%|lr"); + } /* Use bx if it's available. */ else if (arm_arch5 || arm_arch4t) sprintf (instr, "bx%s\t%%|lr", conditional); @@ -23027,7 +23335,11 @@ thumb_exit (FILE *f, int reg_containing_return_addr) asm_fprintf (f, "\tadd\t%r, %r\n", SP_REGNUM, ARM_EH_STACKADJ_REGNUM); if (IS_CMSE_ENTRY (arm_current_func_type ())) - asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr); + { + asm_fprintf (f, "\tmsr\tAPSR_nzcvq, %r\n", + reg_containing_return_addr); + asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr); + } else asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); return; @@ -23263,7 +23575,18 @@ thumb_exit (FILE *f, int reg_containing_return_addr) /* Return to caller. */ if (IS_CMSE_ENTRY (arm_current_func_type ())) - asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr); + { + /* This is for the cases where LR is not being used to contain the return + address. It may therefore contain information that we might not want + to leak, hence it must be cleared. The value in R0 will never be a + secret at this point, so it is safe to use it, see the clearing code + in 'cmse_nonsecure_entry_clear_before_return'. */ + if (reg_containing_return_addr != LR_REGNUM) + asm_fprintf (f, "\tmov\tlr, r0\n"); + + asm_fprintf (f, "\tmsr\tAPSR_nzcvq, %r\n", reg_containing_return_addr); + asm_fprintf (f, "\tbxns\t%r\n", reg_containing_return_addr); + } else asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); } @@ -24130,6 +24453,149 @@ thumb1_expand_prologue (void) cfun->machine->lr_save_eliminated = 0; } +/* Clear caller saved registers not used to pass return values and leaked + condition flags before exiting a cmse_nonsecure_entry function. */ + +void +cmse_nonsecure_entry_clear_before_return (void) +{ + uint64_t to_clear_mask[2]; + uint32_t padding_bits_to_clear = 0; + uint32_t * padding_bits_to_clear_ptr = &padding_bits_to_clear; + int regno, maxregno = IP_REGNUM; + tree result_type; + rtx result_rtl; + + to_clear_mask[0] = (1ULL << (NUM_ARG_REGS)) - 1; + to_clear_mask[0] |= (1ULL << IP_REGNUM); + + /* If we are not dealing with -mfloat-abi=soft we will need to clear VFP + registers. We also check that TARGET_HARD_FLOAT and !TARGET_THUMB1 hold + to make sure the instructions used to clear them are present. */ + if (TARGET_HARD_FLOAT && !TARGET_THUMB1) + { + uint64_t float_mask = (1ULL << (D7_VFP_REGNUM + 1)) - 1; + maxregno = LAST_VFP_REGNUM; + + float_mask &= ~((1ULL << FIRST_VFP_REGNUM) - 1); + to_clear_mask[0] |= float_mask; + + float_mask = (1ULL << (maxregno - 63)) - 1; + to_clear_mask[1] = float_mask; + + /* Make sure we don't clear the two scratch registers used to clear the + relevant FPSCR bits in output_return_instruction. */ + emit_use (gen_rtx_REG (SImode, IP_REGNUM)); + to_clear_mask[0] &= ~(1ULL << IP_REGNUM); + emit_use (gen_rtx_REG (SImode, 4)); + to_clear_mask[0] &= ~(1ULL << 4); + } + + /* If the user has defined registers to be caller saved, these are no longer + restored by the function before returning and must thus be cleared for + security purposes. */ + for (regno = NUM_ARG_REGS; regno < LAST_VFP_REGNUM; regno++) + { + /* We do not touch registers that can be used to pass arguments as per + the AAPCS, since these should never be made callee-saved by user + options. */ + if (IN_RANGE (regno, FIRST_VFP_REGNUM, D7_VFP_REGNUM)) + continue; + if (IN_RANGE (regno, IP_REGNUM, PC_REGNUM)) + continue; + if (call_used_regs[regno]) + to_clear_mask[regno / 64] |= (1ULL << (regno % 64)); + } + + /* Make sure we do not clear the registers used to return the result in. */ + result_type = TREE_TYPE (DECL_RESULT (current_function_decl)); + if (!VOID_TYPE_P (result_type)) + { + result_rtl = arm_function_value (result_type, current_function_decl, 0); + + /* No need to check that we return in registers, because we don't + support returning on stack yet. */ + to_clear_mask[0] + &= ~compute_not_to_clear_mask (result_type, result_rtl, 0, + padding_bits_to_clear_ptr); + } + + if (padding_bits_to_clear != 0) + { + rtx reg_rtx; + /* Padding bits to clear is not 0 so we know we are dealing with + returning a composite type, which only uses r0. Let's make sure that + r1-r3 is cleared too, we will use r1 as a scratch register. */ + gcc_assert ((to_clear_mask[0] & 0xe) == 0xe); + + reg_rtx = gen_rtx_REG (SImode, R1_REGNUM); + + /* Fill the lower half of the negated padding_bits_to_clear. */ + emit_move_insn (reg_rtx, + GEN_INT ((((~padding_bits_to_clear) << 16u) >> 16u))); + + /* Also fill the top half of the negated padding_bits_to_clear. */ + if (((~padding_bits_to_clear) >> 16) > 0) + emit_insn (gen_rtx_SET (gen_rtx_ZERO_EXTRACT (SImode, reg_rtx, + GEN_INT (16), + GEN_INT (16)), + GEN_INT ((~padding_bits_to_clear) >> 16))); + + emit_insn (gen_andsi3 (gen_rtx_REG (SImode, R0_REGNUM), + gen_rtx_REG (SImode, R0_REGNUM), + reg_rtx)); + } + + for (regno = R0_REGNUM; regno <= maxregno; regno++) + { + if (!(to_clear_mask[regno / 64] & (1ULL << (regno % 64)))) + continue; + + if (IS_VFP_REGNUM (regno)) + { + /* If regno is an even vfp register and its successor is also to + be cleared, use vmov. */ + if (TARGET_VFP_DOUBLE + && VFP_REGNO_OK_FOR_DOUBLE (regno) + && to_clear_mask[regno / 64] & (1ULL << ((regno % 64) + 1))) + { + emit_move_insn (gen_rtx_REG (DFmode, regno), + CONST1_RTX (DFmode)); + emit_use (gen_rtx_REG (DFmode, regno)); + regno++; + } + else + { + emit_move_insn (gen_rtx_REG (SFmode, regno), + CONST1_RTX (SFmode)); + emit_use (gen_rtx_REG (SFmode, regno)); + } + } + else + { + if (TARGET_THUMB1) + { + if (regno == R0_REGNUM) + emit_move_insn (gen_rtx_REG (SImode, regno), + const0_rtx); + else + /* R0 has either been cleared before, see code above, or it + holds a return value, either way it is not secret + information. */ + emit_move_insn (gen_rtx_REG (SImode, regno), + gen_rtx_REG (SImode, R0_REGNUM)); + emit_use (gen_rtx_REG (SImode, regno)); + } + else + { + emit_move_insn (gen_rtx_REG (SImode, regno), + gen_rtx_REG (SImode, LR_REGNUM)); + emit_use (gen_rtx_REG (SImode, regno)); + } + } + } +} + /* Generate pattern *pop_multiple_with_stack_update_and_return if single POP instruction can be generated. LR should be replaced by PC. All the checks required are already done by USE_RETURN_INSN (). Hence, @@ -24179,6 +24645,8 @@ thumb2_expand_return (bool simple_return) } else { + if (IS_CMSE_ENTRY (arm_current_func_type ())) + cmse_nonsecure_entry_clear_before_return (); emit_jump_insn (simple_return_rtx); } } @@ -24237,6 +24705,10 @@ thumb1_expand_epilogue (void) if (! df_regs_ever_live_p (LR_REGNUM)) emit_use (gen_rtx_REG (SImode, LR_REGNUM)); + + /* Clear all caller-saved regs that are not used to return. */ + if (IS_CMSE_ENTRY (arm_current_func_type ())) + cmse_nonsecure_entry_clear_before_return (); } /* Epilogue code for APCS frame. */ @@ -24671,6 +25143,14 @@ arm_expand_epilogue (bool really_return) stack_pointer_rtx, stack_pointer_rtx); } + /* Clear all caller-saved regs that are not used to return. */ + if (IS_CMSE_ENTRY (arm_current_func_type ())) + { + /* CMSE_ENTRY always returns. */ + gcc_assert (really_return); + cmse_nonsecure_entry_clear_before_return (); + } + if (!really_return) return; diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md index 5f0dffb..73a7381 100644 --- a/gcc/config/arm/thumb1.md +++ b/gcc/config/arm/thumb1.md @@ -1869,8 +1869,13 @@ "* return thumb1_unexpanded_epilogue (); " - ; Length is absolute worst case - [(set_attr "length" "44") + ; Length is absolute worst case, when using CMSE and if this is an entry + ; function an extra 4 (MSR) bytes will be added. + [(set (attr "length") + (if_then_else + (match_test "IS_CMSE_ENTRY (arm_current_func_type ())") + (const_int 48) + (const_int 44))) (set_attr "type" "block") ;; We don't clobber the conditions, but the potential length of this ;; operation is sufficient to make conditionalizing the sequence diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index affcd83..9029a2f 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -1114,12 +1114,31 @@ (define_insn "*thumb2_return" [(simple_return)] - "TARGET_THUMB2" + "TARGET_THUMB2 && !IS_CMSE_ENTRY (arm_current_func_type ())" "* return output_return_instruction (const_true_rtx, true, false, true);" [(set_attr "type" "branch") (set_attr "length" "4")] ) +(define_insn "*thumb2_cmse_entry_return" + [(simple_return)] + "TARGET_THUMB2 && IS_CMSE_ENTRY (arm_current_func_type ())" + "* return output_return_instruction (const_true_rtx, true, false, true);" + [(set_attr "type" "branch") + ; This is a return from a cmse_nonsecure_entry function so code will be + ; added to clear the APSR and potentially the FPSCR if VFP is available, so + ; we adapt the length accordingly. + (set (attr "length") + (if_then_else (match_test "TARGET_HARD_FLOAT") + (const_int 12) + (const_int 8))) + ; We do not support predicate execution of returns from cmse_nonsecure_entry + ; functions because we need to clear the APSR. Since predicable has to be + ; a constant, we had to duplicate the thumb2_return pattern for CMSE entry + ; functions. + (set_attr "predicable" "no")] +) + (define_insn_and_split "thumb2_eh_return" [(unspec_volatile [(match_operand:SI 0 "s_register_operand" "r")] VUNSPEC_EH_RETURN) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index e31a1b3..7cb1616 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,6 +1,22 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * gcc.target/arm/cmse/cmse.exp: Test different multilibs separate. + * gcc.target/arm/cmse/struct-1.c: New. + * gcc.target/arm/cmse/bitfield-1.c: New. + * gcc.target/arm/cmse/bitfield-2.c: New. + * gcc.target/arm/cmse/bitfield-3.c: New. + * gcc.target/arm/cmse/baseline/cmse-2.c: New. + * gcc.target/arm/cmse/baseline/softfp.c: New. + * gcc.target/arm/cmse/mainline/soft/cmse-5.c: New. + * gcc.target/arm/cmse/mainline/hard/cmse-5.c: New. + * gcc.target/arm/cmse/mainline/hard-sp/cmse-5.c: New. + * gcc.target/arm/cmse/mainline/softfp/cmse-5.c: New. + * gcc.target/arm/cmse/mainline/softfp-sp/cmse-5.c: New. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * gcc.target/arm/cmse/cmse-4.c: New. * gcc.target/arm/cmse/cmse-9.c: New. * gcc.target/arm/cmse/cmse-10.c: New. diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-2.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-2.c new file mode 100644 index 0000000..814502d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-2.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-add-options arm_arch_v8m_base } */ +/* { dg-options "-mcmse" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} +/* { dg-final { scan-assembler "movs\tr1, r0" } } */ +/* { dg-final { scan-assembler "movs\tr2, r0" } } */ +/* { dg-final { scan-assembler "movs\tr3, r0" } } */ +/* { dg-final { scan-assembler "mov\tip, r0" } } */ +/* { dg-final { scan-assembler "mov\tlr, r0" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq," } } */ +/* { dg-final { scan-assembler "bxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/softfp.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/softfp.c new file mode 100644 index 0000000..0069fcd --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/softfp.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-add-options arm_arch_v8m_base } */ +/* { dg-options "-mcmse -mfloat-abi=softfp" } */ + +double __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +double +foo (double a) +{ + return bar (1.0f, 2.0) + a; +} + +float __attribute__ ((cmse_nonsecure_entry)) +baz (float a, double b) +{ + return (float) bar (a, b); +} + +/* Make sure we are not using FP instructions, since ARMv8-M Baseline does not + support such instructions. */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ +/* { dg-final { scan-assembler-not "vmrs" } } */ + +/* Just double checking that we are still doing cmse though. */ +/* { dg-final { scan-assembler-not "vmrs" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/bitfield-1.c b/gcc/testsuite/gcc.target/arm/cmse/bitfield-1.c new file mode 100644 index 0000000..fccc51d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/bitfield-1.c @@ -0,0 +1,39 @@ +/* { dg-do run } */ +/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */ + +typedef struct +{ + unsigned short a : 6; + unsigned char b : 3; + unsigned char c; + unsigned short d : 8; +} test_st; + +test_st __attribute__ ((cmse_nonsecure_entry)) foo (void) +{ + test_st t; + t.a = 63u; + t.b = 7u; + t.c = 255u; + t.d = 255u; + return t; +} + +int +main (void) +{ + test_st t; + t = foo (); + if (t.a != 63u + || t.b != 7u + || t.c != 255u + || t.d != 255u) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-assembler "movw\tr1, #1855" } } */ +/* { dg-final { scan-assembler "movt\tr1, 65535" } } */ +/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */ +/* { dg-final { scan-assembler "bxns" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/bitfield-2.c b/gcc/testsuite/gcc.target/arm/cmse/bitfield-2.c new file mode 100644 index 0000000..e6aee3c --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/bitfield-2.c @@ -0,0 +1,36 @@ +/* { dg-do run } */ +/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */ + +typedef struct +{ + short a : 7; + signed char b : 3; + short c : 11; +} test_st; + +test_st __attribute__ ((cmse_nonsecure_entry)) foo (void) +{ + test_st t; + t.a = -64; + t.b = -4 ; + t.c = -1024; + return t; +} + +int +main (void) +{ + test_st t; + t = foo (); + if (t.a != -64 + || t.b != -4 + || t.c != -1024) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-assembler "movw\tr1, #1919" } } */ +/* { dg-final { scan-assembler "movt\tr1, 2047" } } */ +/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */ +/* { dg-final { scan-assembler "bxns" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/bitfield-3.c b/gcc/testsuite/gcc.target/arm/cmse/bitfield-3.c new file mode 100644 index 0000000..285a2b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/bitfield-3.c @@ -0,0 +1,37 @@ +/* { dg-do run } */ +/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */ + +typedef struct +{ + short a; + signed char b : 2; + short : 1; + signed char c : 3; +} test_st; + +test_st __attribute__ ((cmse_nonsecure_entry)) foo (void) +{ + test_st t; + t.a = -32768; + t.b = -2; + t.c = -4; + return t; +} + +int +main (void) +{ + test_st t; + t = foo (); + if (t.a != -32768 + || t.b != -2 + || t.c != -4) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-assembler "movw\tr1, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr1, 63" } } */ +/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */ +/* { dg-final { scan-assembler "bxns" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse.exp b/gcc/testsuite/gcc.target/arm/cmse/cmse.exp index f797dba..38f1841 100644 --- a/gcc/testsuite/gcc.target/arm/cmse/cmse.exp +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse.exp @@ -43,6 +43,26 @@ set LTO_TORTURE_OPTIONS "" gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] \ "" $DEFAULT_CFLAGS +if {[check_effective_target_arm_arch_v8m_base_ok]} then { + # Baseline only + gcc-dg-runtest [lsort [glob $srcdir/$subdir/baseline/*.c]] \ + "" $DEFAULT_CFLAGS +} + +if {[check_effective_target_arm_arch_v8m_main_ok]} then { + # Mainline -mfloat-abi=soft + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/soft/*.c]] \ + "-mfloat-abi=soft" $DEFAULT_CFLAGS + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/softfp/*.c]] \ + "" $DEFAULT_CFLAGS + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/softfp-sp/*.c]] \ + "" $DEFAULT_CFLAGS + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/hard/*.c]] \ + "" $DEFAULT_CFLAGS + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/hard-sp/*.c]] \ + "" $DEFAULT_CFLAGS +} + set LTO_TORTURE_OPTIONS ${saved-lto_torture_options} set dg-do-what-default ${saved-dg-do-what-default} diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-5.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-5.c new file mode 100644 index 0000000..88dec27 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-5.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} +/* { dg-final { scan-assembler "mov\tr0, lr" } } */ +/* { dg-final { scan-assembler "mov\tr1, lr" } } */ +/* { dg-final { scan-assembler "mov\tr2, lr" } } */ +/* { dg-final { scan-assembler "mov\tr3, lr" } } */ +/* { dg-final { scan-assembler-not "vmov\.f32\ts0, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts2, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts3, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts4, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts5, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts6, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts7, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts8, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts9, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts10, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts11, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts12, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts13, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts14, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts15, #1\.0" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */ +/* { dg-final { scan-assembler "push\t{r4}" } } */ +/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65376" } } */ +/* { dg-final { scan-assembler "movt\tr4, #4095" } } */ +/* { dg-final { scan-assembler "and\tip, r4" } } */ +/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */ +/* { dg-final { scan-assembler "pop\t{r4}" } } */ +/* { dg-final { scan-assembler "mov\tip, lr" } } */ +/* { dg-final { scan-assembler "bxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-5.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-5.c new file mode 100644 index 0000000..29f60ba --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-5.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} +/* { dg-final { scan-assembler "mov\tr0, lr" } } */ +/* { dg-final { scan-assembler "mov\tr1, lr" } } */ +/* { dg-final { scan-assembler "mov\tr2, lr" } } */ +/* { dg-final { scan-assembler "mov\tr3, lr" } } */ +/* { dg-final { scan-assembler-not "vmov\.f32\ts0, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td1, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td2, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td3, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td4, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td5, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td6, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td7, #1\.0" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */ +/* { dg-final { scan-assembler "push\t{r4}" } } */ +/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65376" } } */ +/* { dg-final { scan-assembler "movt\tr4, #4095" } } */ +/* { dg-final { scan-assembler "and\tip, r4" } } */ +/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */ +/* { dg-final { scan-assembler "pop\t{r4}" } } */ +/* { dg-final { scan-assembler "mov\tip, lr" } } */ +/* { dg-final { scan-assembler "bxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-5.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-5.c new file mode 100644 index 0000000..a7229ea --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-5.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */ +/* { dg-options "-mcmse -mfloat-abi=soft" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} + +/* { dg-final { scan-assembler "mov\tr1, lr" } } */ +/* { dg-final { scan-assembler "mov\tr2, lr" } } */ +/* { dg-final { scan-assembler "mov\tr3, lr" } } */ +/* { dg-final { scan-assembler "mov\tip, lr" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */ +/* { dg-final { scan-assembler "bxns" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-5.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-5.c new file mode 100644 index 0000000..7734d77 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-5.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} +/* { dg-final { scan-assembler "__acle_se_foo:" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, lr" } } */ +/* { dg-final { scan-assembler "mov\tr1, lr" } } */ +/* { dg-final { scan-assembler "mov\tr2, lr" } } */ +/* { dg-final { scan-assembler "mov\tr3, lr" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts0, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts1, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts2, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts3, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts4, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts5, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts6, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts7, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts8, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts9, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts10, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts11, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts12, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts13, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts14, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f32\ts15, #1\.0" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */ +/* { dg-final { scan-assembler "push\t{r4}" } } */ +/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65376" } } */ +/* { dg-final { scan-assembler "movt\tr4, #4095" } } */ +/* { dg-final { scan-assembler "and\tip, r4" } } */ +/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */ +/* { dg-final { scan-assembler "pop\t{r4}" } } */ +/* { dg-final { scan-assembler "mov\tip, lr" } } */ +/* { dg-final { scan-assembler "bxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-5.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-5.c new file mode 100644 index 0000000..6addaa1 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-5.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */ + +extern float bar (void); + +float __attribute__ ((cmse_nonsecure_entry)) +foo (void) +{ + return bar (); +} +/* { dg-final { scan-assembler "__acle_se_foo:" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, lr" } } */ +/* { dg-final { scan-assembler "mov\tr1, lr" } } */ +/* { dg-final { scan-assembler "mov\tr2, lr" } } */ +/* { dg-final { scan-assembler "mov\tr3, lr" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td0, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td1, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td2, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td3, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td4, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td5, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td6, #1\.0" } } */ +/* { dg-final { scan-assembler "vmov\.f64\td7, #1\.0" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq, lr" { target { arm_arch_v8m_main_ok && { ! arm_dsp } } } } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvqg, lr" { target { arm_arch_v8m_main_ok && arm_dsp } } } } */ +/* { dg-final { scan-assembler "push\t{r4}" } } */ +/* { dg-final { scan-assembler "vmrs\tip, fpscr" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65376" } } */ +/* { dg-final { scan-assembler "movt\tr4, #4095" } } */ +/* { dg-final { scan-assembler "and\tip, r4" } } */ +/* { dg-final { scan-assembler "vmsr\tfpscr, ip" } } */ +/* { dg-final { scan-assembler "pop\t{r4}" } } */ +/* { dg-final { scan-assembler "mov\tip, lr" } } */ +/* { dg-final { scan-assembler "bxns" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/struct-1.c b/gcc/testsuite/gcc.target/arm/cmse/struct-1.c new file mode 100644 index 0000000..2d366a9 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/struct-1.c @@ -0,0 +1,33 @@ +/* { dg-do run } */ +/* { dg-options "--save-temps -mcmse -Wl,--section-start,.gnu.sgstubs=0x20400000" } */ + +typedef struct +{ + unsigned char a; + unsigned short b; +} test_st; + +test_st __attribute__ ((cmse_nonsecure_entry)) foo (void) +{ + test_st t; + t.a = 255u; + t.b = 32767u; + return t; +} + +int +main (void) +{ + test_st t; + t = foo (); + if (t.a != 255u || t.b != 32767u) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-assembler "movs\tr1, #255" } } */ +/* { dg-final { scan-assembler "movt\tr1, 65535" } } */ +/* { dg-final { scan-assembler "ands\tr0(, r0)?, r1" } } */ +/* { dg-final { scan-assembler "bxns" } } */ + + -- cgit v1.1 From 32ce1e4f244830404328e5a45d062c2f5bee662d Mon Sep 17 00:00:00 2001 From: Andre Vieira Date: Fri, 2 Dec 2016 15:30:37 +0000 Subject: Handling ARMv8-M Security Extension's cmse_nonsecure_call attribute gcc/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * config/arm/arm.c (gimplify.h): New include. (arm_handle_cmse_nonsecure_call): New. (arm_attribute_table): Added cmse_nonsecure_call. (arm_comp_type_attributes): Deny compatibility of function types with without the cmse_nonsecure_call attribute. * doc/extend.texi (ARM ARMv8-M Security Extensions): New attribute. gcc/testsuite/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * gcc.target/arm/cmse/cmse-3.c: Add tests. * gcc.target/arm/cmse/cmse-4.c: Add tests. * gcc.target/arm/cmse/cmse-15.c: New. Co-Authored-By: Thomas Preud'homme From-SVN: r243191 --- gcc/ChangeLog | 10 ++++ gcc/config/arm/arm.c | 84 +++++++++++++++++++++++++++++ gcc/doc/extend.texi | 4 +- gcc/testsuite/ChangeLog | 7 +++ gcc/testsuite/gcc.target/arm/cmse/cmse-15.c | 72 +++++++++++++++++++++++++ gcc/testsuite/gcc.target/arm/cmse/cmse-3.c | 8 +++ gcc/testsuite/gcc.target/arm/cmse/cmse-4.c | 7 +++ 7 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/cmse/cmse-15.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index bb1e524..ce79fdd 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,6 +1,16 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * config/arm/arm.c (gimplify.h): New include. + (arm_handle_cmse_nonsecure_call): New. + (arm_attribute_table): Added cmse_nonsecure_call. + (arm_comp_type_attributes): Deny compatibility of function types + with without the cmse_nonsecure_call attribute. + * doc/extend.texi (ARM ARMv8-M Security Extensions): New attribute. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * config/arm/arm.c (output_return_instruction): Clear registers. (thumb2_expand_return): Likewise. diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 6a9db85..a6b07b2 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -63,6 +63,7 @@ #include "tm-constrs.h" #include "rtl-iter.h" #include "optabs-libfuncs.h" +#include "gimplify.h" /* This file should be included last. */ #include "target-def.h" @@ -137,6 +138,7 @@ static tree arm_handle_isr_attribute (tree *, tree, tree, int, bool *); static tree arm_handle_notshared_attribute (tree *, tree, tree, int, bool *); #endif static tree arm_handle_cmse_nonsecure_entry (tree *, tree, tree, int, bool *); +static tree arm_handle_cmse_nonsecure_call (tree *, tree, tree, int, bool *); static void arm_output_function_epilogue (FILE *, HOST_WIDE_INT); static void arm_output_function_prologue (FILE *, HOST_WIDE_INT); static int arm_comp_type_attributes (const_tree, const_tree); @@ -348,6 +350,8 @@ static const struct attribute_spec arm_attribute_table[] = /* ARMv8-M Security Extensions support. */ { "cmse_nonsecure_entry", 0, 0, true, false, false, arm_handle_cmse_nonsecure_entry, false }, + { "cmse_nonsecure_call", 0, 0, true, false, false, + arm_handle_cmse_nonsecure_call, true }, { NULL, 0, 0, false, false, false, NULL, false } }; @@ -6753,6 +6757,78 @@ arm_handle_cmse_nonsecure_entry (tree *node, tree name, return NULL_TREE; } + +/* Called upon detection of the use of the cmse_nonsecure_call attribute, this + function will check whether the attribute is allowed here and will add the + attribute to the function type tree or otherwise issue a diagnostic. The + reason we check this at declaration time is to only allow the use of the + attribute with declarations of function pointers and not function + declarations. This function checks NODE is of the expected type and issues + diagnostics otherwise using NAME. If it is not of the expected type + *NO_ADD_ATTRS will be set to true. */ + +static tree +arm_handle_cmse_nonsecure_call (tree *node, tree name, + tree /* args */, + int /* flags */, + bool *no_add_attrs) +{ + tree decl = NULL_TREE, fntype = NULL_TREE; + tree main_variant, type; + + if (!use_cmse) + { + *no_add_attrs = true; + warning (OPT_Wattributes, "%qE attribute ignored without -mcmse option.", + name); + return NULL_TREE; + } + + if (TREE_CODE (*node) == VAR_DECL || TREE_CODE (*node) == TYPE_DECL) + { + decl = *node; + fntype = TREE_TYPE (decl); + } + + while (fntype != NULL_TREE && TREE_CODE (fntype) == POINTER_TYPE) + fntype = TREE_TYPE (fntype); + + if (!decl || TREE_CODE (fntype) != FUNCTION_TYPE) + { + warning (OPT_Wattributes, "%qE attribute only applies to base type of a " + "function pointer", name); + *no_add_attrs = true; + return NULL_TREE; + } + + *no_add_attrs |= cmse_func_args_or_return_in_stack (NULL, name, fntype); + + if (*no_add_attrs) + return NULL_TREE; + + /* Prevent trees being shared among function types with and without + cmse_nonsecure_call attribute. */ + type = TREE_TYPE (decl); + + type = build_distinct_type_copy (type); + TREE_TYPE (decl) = type; + fntype = type; + + while (TREE_CODE (fntype) != FUNCTION_TYPE) + { + type = fntype; + fntype = TREE_TYPE (fntype); + fntype = build_distinct_type_copy (fntype); + TREE_TYPE (type) = fntype; + } + + /* Construct a type attribute and add it to the function type. */ + tree attrs = tree_cons (get_identifier ("cmse_nonsecure_call"), NULL_TREE, + TYPE_ATTRIBUTES (fntype)); + TYPE_ATTRIBUTES (fntype) = attrs; + return NULL_TREE; +} + /* Return 0 if the attributes for two types are incompatible, 1 if they are compatible, and 2 if they are nearly compatible (which causes a warning to be generated). */ @@ -6793,6 +6869,14 @@ arm_comp_type_attributes (const_tree type1, const_tree type2) if (l1 != l2) return 0; + l1 = lookup_attribute ("cmse_nonsecure_call", + TYPE_ATTRIBUTES (type1)) != NULL; + l2 = lookup_attribute ("cmse_nonsecure_call", + TYPE_ATTRIBUTES (type2)) != NULL; + + if (l1 != l2) + return 0; + return 1; } diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index d72af3e..0fa59ff 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -12648,8 +12648,8 @@ Security Extensions: Requiremenets on Development Tools Engineering Specification, which can be found at @uref{http://infocenter.arm.com/help/topic/com.arm.doc.ecm0359818/ECM0359818_armv8m_security_extensions_reqs_on_dev_tools_1_0.pdf}. -As part of the Security Extensions GCC implements a new function attribute -@code{cmse_nonsecure_entry}. +As part of the Security Extensions GCC implements two new function attributes: +@code{cmse_nonsecure_entry} and @code{cmse_nonsecure_call}. As part of the Security Extensions GCC implements the intrinsics below. FPTR is used here to mean any function pointer type. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 7cb1616..ca431e4 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,6 +1,13 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * gcc.target/arm/cmse/cmse-3.c: Add tests. + * gcc.target/arm/cmse/cmse-4.c: Add tests. + * gcc.target/arm/cmse/cmse-15.c: New. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * gcc.target/arm/cmse/cmse.exp: Test different multilibs separate. * gcc.target/arm/cmse/struct-1.c: New. * gcc.target/arm/cmse/bitfield-1.c: New. diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-15.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-15.c new file mode 100644 index 0000000..4e9ace1 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-15.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*ns_foo) (void); +int (*s_bar) (void); +int __attribute__ ((cmse_nonsecure_call)) (**ns_foo2) (void); +int (**s_bar2) (void); + +typedef int __attribute__ ((cmse_nonsecure_call)) ns_foo_t (void); +typedef int s_bar_t (void); +typedef int __attribute__ ((cmse_nonsecure_call)) (* ns_foo_ptr) (void); +typedef int (*s_bar_ptr) (void); + +int nonsecure0 (ns_foo_t * ns_foo_p) +{ + return ns_foo_p (); +} + +int nonsecure1 (ns_foo_t ** ns_foo_p) +{ + return (*ns_foo_p) (); +} + +int nonsecure2 (ns_foo_ptr ns_foo_p) +{ + return ns_foo_p (); +} +int nonsecure3 (ns_foo_ptr * ns_foo_p) +{ + return (*ns_foo_p) (); +} + +int secure0 (s_bar_t * s_bar_p) +{ + return s_bar_p (); +} + +int secure1 (s_bar_t ** s_bar_p) +{ + return (*s_bar_p) (); +} + +int secure2 (s_bar_ptr s_bar_p) +{ + return s_bar_p (); +} + +int secure3 (s_bar_ptr * s_bar_p) +{ + return (*s_bar_p) (); +} + +int nonsecure4 (void) +{ + return ns_foo (); +} + +int nonsecure5 (void) +{ + return (*ns_foo2) (); +} + +int secure4 (void) +{ + return s_bar (); +} + +int secure5 (void) +{ + return (*s_bar2) (); +} +/* { dg-final { scan-assembler-times "bl\\s+__gnu_cmse_nonsecure_call" 6 } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-3.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-3.c index 2c2920e..7f92a4c 100644 --- a/gcc/testsuite/gcc.target/arm/cmse/cmse-3.c +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-3.c @@ -35,3 +35,11 @@ norf (struct span2 a) {} void __attribute__ ((cmse_nonsecure_entry)) foo2 (long long a, int b, union test_union c) {} /* { dg-error "not available to functions with arguments passed on the stack" } */ + +typedef void __attribute__ ((cmse_nonsecure_call)) bar2 (long long a, int b, long long c); /* { dg-error "not available to functions with arguments passed on the stack" } */ + +typedef void __attribute__ ((cmse_nonsecure_call)) baz2 (long long a, int b, struct span c); /* { dg-error "not available to functions with arguments passed on the stack" } */ + +typedef struct span __attribute__ ((cmse_nonsecure_call)) qux2 (void); /* { dg-error "not available to functions that return value on the stack" } */ + +typedef void __attribute__ ((cmse_nonsecure_call)) norf2 (int a, ...); /* { dg-error "not available to functions with variable number of arguments" } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-4.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-4.c index 6f930ab..d0999a4 100644 --- a/gcc/testsuite/gcc.target/arm/cmse/cmse-4.c +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-4.c @@ -19,9 +19,16 @@ baz (void) return qux (); } +void __attribute__ ((cmse_nonsecure_call)) +quux (void) {} /* { dg-warning "attribute only applies to base type of a function pointer" } */ + +int __attribute__ ((cmse_nonsecure_call)) norf; /* { dg-warning "attribute only applies to base type of a function pointer" } */ + /* { dg-final { scan-assembler-times "bxns" 2 } } */ /* { dg-final { scan-assembler "foo:" } } */ /* { dg-final { scan-assembler "__acle_se_foo:" } } */ /* { dg-final { scan-assembler-not "__acle_se_bar:" } } */ /* { dg-final { scan-assembler "baz:" } } */ /* { dg-final { scan-assembler "__acle_se_baz:" } } */ +/* { dg-final { scan-assembler-not "__acle_se_quux:" } } */ +/* { dg-final { scan-assembler-not "__acle_se_norf:" } } */ -- cgit v1.1 From c92e08e3d766baf88c7507cd5224d4d241ff8d39 Mon Sep 17 00:00:00 2001 From: Andre Vieira Date: Fri, 2 Dec 2016 15:33:26 +0000 Subject: ARMv8-M Security Extension's cmse_nonsecure_call: use __gnu_cmse_nonsecure_call gcc/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * config/arm/arm.c (detect_cmse_nonsecure_call): New. (cmse_nonsecure_call_clear_caller_saved): New. (arm_reorg): Use cmse_nonsecure_call_clear_caller_saved. (arm_function_ok_for_sibcall): Disable sibcalls for cmse_nonsecure_call. * config/arm/arm-protos.h (detect_cmse_nonsecure_call): New. * config/arm/arm.md (call): Handle cmse_nonsecure_entry. (call_value): Likewise. (nonsecure_call_internal): New. (nonsecure_call_value_internal): New. * config/arm/thumb1.md (*nonsecure_call_reg_thumb1_v5): New. (*nonsecure_call_value_reg_thumb1_v5): New. * config/arm/thumb2.md (*nonsecure_call_reg_thumb2): New. (*nonsecure_call_value_reg_thumb2): New. * config/arm/unspecs.md (UNSPEC_NONSECURE_MEM): New. libgcc/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * config/arm/cmse_nonsecure_call.S: New. * config/arm/t-arm: Compile cmse_nonsecure_call.S gcc/testsuite/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * gcc.target/arm/cmse/cmse.exp: Run tests in mainline dir. * gcc.target/arm/cmse/cmse-9.c: Added some extra tests. * gcc.target/arm/cmse/cmse-14.c: New. * gcc.target/arm/cmse/baseline/bitfield-4.c: New. * gcc.target/arm/cmse/baseline/bitfield-5.c: New. * gcc.target/arm/cmse/baseline/bitfield-6.c: New. * gcc.target/arm/cmse/baseline/bitfield-7.c: New. * gcc.target/arm/cmse/baseline/bitfield-8.c: New. * gcc.target/arm/cmse/baseline/bitfield-9.c: New. * gcc.target/arm/cmse/baseline/bitfield-and-union-1.c: New. * gcc.target/arm/cmse/baseline/cmse-11.c: New. * gcc.target/arm/cmse/baseline/cmse-13.c: New. * gcc.target/arm/cmse/baseline/cmse-6.c: New. * gcc.target/arm/cmse/baseline/union-1.c: New. * gcc.target/arm/cmse/baseline/union-2.c: New. * gcc.target/arm/cmse/mainline/bitfield-4.c: New. * gcc.target/arm/cmse/mainline/bitfield-5.c: New. * gcc.target/arm/cmse/mainline/bitfield-6.c: New. * gcc.target/arm/cmse/mainline/bitfield-7.c: New. * gcc.target/arm/cmse/mainline/bitfield-8.c: New. * gcc.target/arm/cmse/mainline/bitfield-9.c: New. * gcc.target/arm/cmse/mainline/bitfield-and-union-1.c: New. * gcc.target/arm/cmse/mainline/union-1.c: New. * gcc.target/arm/cmse/mainline/union-2.c: New. * gcc.target/arm/cmse/mainline/hard-sp/cmse-13.c: New. * gcc.target/arm/cmse/mainline/hard-sp/cmse-7.c: New. * gcc.target/arm/cmse/mainline/hard-sp/cmse-8.c: New. * gcc.target/arm/cmse/mainline/hard/cmse-13.c: New. * gcc.target/arm/cmse/mainline/hard/cmse-7.c: New. * gcc.target/arm/cmse/mainline/hard/cmse-8.c: New. * gcc.target/arm/cmse/mainline/soft/cmse-13.c: New. * gcc.target/arm/cmse/mainline/soft/cmse-7.c: New. * gcc.target/arm/cmse/mainline/soft/cmse-8.c: New. * gcc.target/arm/cmse/mainline/softfp-sp/cmse-7.c: New. * gcc.target/arm/cmse/mainline/softfp-sp/cmse-8.c: New. * gcc.target/arm/cmse/mainline/softfp/cmse-13.c: New. * gcc.target/arm/cmse/mainline/softfp/cmse-7.c: New. * gcc.target/arm/cmse/mainline/softfp/cmse-8.c: New. Co-Authored-By: Thomas Preud'homme From-SVN: r243192 --- gcc/ChangeLog | 19 ++ gcc/config/arm/arm-protos.h | 1 + gcc/config/arm/arm.c | 219 +++++++++++++++++++++ gcc/config/arm/arm.md | 67 ++++++- gcc/config/arm/thumb1.md | 28 +++ gcc/config/arm/thumb2.md | 28 +++ gcc/config/arm/unspecs.md | 2 + gcc/testsuite/ChangeLog | 42 ++++ .../gcc.target/arm/cmse/baseline/bitfield-4.c | 57 ++++++ .../gcc.target/arm/cmse/baseline/bitfield-5.c | 53 +++++ .../gcc.target/arm/cmse/baseline/bitfield-6.c | 63 ++++++ .../gcc.target/arm/cmse/baseline/bitfield-7.c | 54 +++++ .../gcc.target/arm/cmse/baseline/bitfield-8.c | 57 ++++++ .../gcc.target/arm/cmse/baseline/bitfield-9.c | 56 ++++++ .../arm/cmse/baseline/bitfield-and-union-1.c | 96 +++++++++ .../gcc.target/arm/cmse/baseline/cmse-11.c | 22 +++ .../gcc.target/arm/cmse/baseline/cmse-13.c | 25 +++ .../gcc.target/arm/cmse/baseline/cmse-6.c | 21 ++ .../gcc.target/arm/cmse/baseline/union-1.c | 71 +++++++ .../gcc.target/arm/cmse/baseline/union-2.c | 86 ++++++++ gcc/testsuite/gcc.target/arm/cmse/cmse-14.c | 13 ++ gcc/testsuite/gcc.target/arm/cmse/cmse-9.c | 10 +- gcc/testsuite/gcc.target/arm/cmse/cmse.exp | 2 + .../gcc.target/arm/cmse/mainline/bitfield-4.c | 55 ++++++ .../gcc.target/arm/cmse/mainline/bitfield-5.c | 51 +++++ .../gcc.target/arm/cmse/mainline/bitfield-6.c | 61 ++++++ .../gcc.target/arm/cmse/mainline/bitfield-7.c | 52 +++++ .../gcc.target/arm/cmse/mainline/bitfield-8.c | 55 ++++++ .../gcc.target/arm/cmse/mainline/bitfield-9.c | 54 +++++ .../arm/cmse/mainline/bitfield-and-union-1.c | 94 +++++++++ .../gcc.target/arm/cmse/mainline/hard-sp/cmse-13.c | 43 ++++ .../gcc.target/arm/cmse/mainline/hard-sp/cmse-7.c | 42 ++++ .../gcc.target/arm/cmse/mainline/hard-sp/cmse-8.c | 41 ++++ .../gcc.target/arm/cmse/mainline/hard/cmse-13.c | 38 ++++ .../gcc.target/arm/cmse/mainline/hard/cmse-7.c | 34 ++++ .../gcc.target/arm/cmse/mainline/hard/cmse-8.c | 33 ++++ .../gcc.target/arm/cmse/mainline/soft/cmse-13.c | 27 +++ .../gcc.target/arm/cmse/mainline/soft/cmse-7.c | 27 +++ .../gcc.target/arm/cmse/mainline/soft/cmse-8.c | 26 +++ .../arm/cmse/mainline/softfp-sp/cmse-7.c | 26 +++ .../arm/cmse/mainline/softfp-sp/cmse-8.c | 25 +++ .../gcc.target/arm/cmse/mainline/softfp/cmse-13.c | 25 +++ .../gcc.target/arm/cmse/mainline/softfp/cmse-7.c | 26 +++ .../gcc.target/arm/cmse/mainline/softfp/cmse-8.c | 25 +++ .../gcc.target/arm/cmse/mainline/union-1.c | 69 +++++++ .../gcc.target/arm/cmse/mainline/union-2.c | 84 ++++++++ libgcc/ChangeLog | 6 + libgcc/config/arm/cmse_nonsecure_call.S | 131 ++++++++++++ libgcc/config/arm/t-arm | 2 + 49 files changed, 2238 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-4.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-5.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-6.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-7.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-8.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-9.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-and-union-1.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-11.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-13.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-6.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/union-1.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/baseline/union-2.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/cmse-14.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-4.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-5.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-6.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-7.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-8.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-9.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-and-union-1.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-13.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-7.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-8.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-13.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-7.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-8.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-13.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-7.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-8.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-7.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-8.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-13.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-7.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-8.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/union-1.c create mode 100644 gcc/testsuite/gcc.target/arm/cmse/mainline/union-2.c create mode 100644 libgcc/config/arm/cmse_nonsecure_call.S diff --git a/gcc/ChangeLog b/gcc/ChangeLog index ce79fdd..807d406 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,6 +1,25 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * config/arm/arm.c (detect_cmse_nonsecure_call): New. + (cmse_nonsecure_call_clear_caller_saved): New. + (arm_reorg): Use cmse_nonsecure_call_clear_caller_saved. + (arm_function_ok_for_sibcall): Disable sibcalls for + cmse_nonsecure_call. + * config/arm/arm-protos.h (detect_cmse_nonsecure_call): New. + * config/arm/arm.md (call): Handle cmse_nonsecure_entry. + (call_value): Likewise. + (nonsecure_call_internal): New. + (nonsecure_call_value_internal): New. + * config/arm/thumb1.md (*nonsecure_call_reg_thumb1_v5): New. + (*nonsecure_call_value_reg_thumb1_v5): New. + * config/arm/thumb2.md (*nonsecure_call_reg_thumb2): New. + (*nonsecure_call_value_reg_thumb2): New. + * config/arm/unspecs.md (UNSPEC_NONSECURE_MEM): New. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * config/arm/arm.c (gimplify.h): New include. (arm_handle_cmse_nonsecure_call): New. (arm_attribute_table): Added cmse_nonsecure_call. diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 634a5de..05d73ab 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -137,6 +137,7 @@ extern int arm_const_double_inline_cost (rtx); extern bool arm_const_double_by_parts (rtx); extern bool arm_const_double_by_immediates (rtx); extern void arm_emit_call_insn (rtx, rtx, bool); +bool detect_cmse_nonsecure_call (tree); extern const char *output_call (rtx *); void arm_emit_movpair (rtx, rtx); extern const char *output_mov_long_double_arm_from_arm (rtx *); diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index a6b07b2..f1df3a0 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -7000,6 +7000,15 @@ arm_function_ok_for_sibcall (tree decl, tree exp) if (IS_CMSE_ENTRY (arm_current_func_type ())) return false; + /* We do not allow ARMv8-M non-secure calls to be turned into sibling calls, + this would complicate matters for later code generation. */ + if (TREE_CODE (exp) == CALL_EXPR) + { + tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp))); + if (lookup_attribute ("cmse_nonsecure_call", TYPE_ATTRIBUTES (fntype))) + return false; + } + if (!VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) { /* Check that the return value locations are the same. For @@ -16654,6 +16663,197 @@ compute_not_to_clear_mask (tree arg_type, rtx arg_rtx, int regno, return not_to_clear_mask; } +/* Saves callee saved registers, clears callee saved registers and caller saved + registers not used to pass arguments before a cmse_nonsecure_call. And + restores the callee saved registers after. */ + +static void +cmse_nonsecure_call_clear_caller_saved (void) +{ + basic_block bb; + + FOR_EACH_BB_FN (bb, cfun) + { + rtx_insn *insn; + + FOR_BB_INSNS (bb, insn) + { + uint64_t to_clear_mask, float_mask; + rtx_insn *seq; + rtx pat, call, unspec, reg, cleared_reg, tmp; + unsigned int regno, maxregno; + rtx address; + CUMULATIVE_ARGS args_so_far_v; + cumulative_args_t args_so_far; + tree arg_type, fntype; + bool using_r4, first_param = true; + function_args_iterator args_iter; + uint32_t padding_bits_to_clear[4] = {0U, 0U, 0U, 0U}; + uint32_t * padding_bits_to_clear_ptr = &padding_bits_to_clear[0]; + + if (!NONDEBUG_INSN_P (insn)) + continue; + + if (!CALL_P (insn)) + continue; + + pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL && XVECLEN (pat, 0) > 0); + call = XVECEXP (pat, 0, 0); + + /* Get the real call RTX if the insn sets a value, ie. returns. */ + if (GET_CODE (call) == SET) + call = SET_SRC (call); + + /* Check if it is a cmse_nonsecure_call. */ + unspec = XEXP (call, 0); + if (GET_CODE (unspec) != UNSPEC + || XINT (unspec, 1) != UNSPEC_NONSECURE_MEM) + continue; + + /* Determine the caller-saved registers we need to clear. */ + to_clear_mask = (1LL << (NUM_ARG_REGS)) - 1; + maxregno = NUM_ARG_REGS - 1; + /* Only look at the caller-saved floating point registers in case of + -mfloat-abi=hard. For -mfloat-abi=softfp we will be using the + lazy store and loads which clear both caller- and callee-saved + registers. */ + if (TARGET_HARD_FLOAT_ABI) + { + float_mask = (1LL << (D7_VFP_REGNUM + 1)) - 1; + float_mask &= ~((1LL << FIRST_VFP_REGNUM) - 1); + to_clear_mask |= float_mask; + maxregno = D7_VFP_REGNUM; + } + + /* Make sure the register used to hold the function address is not + cleared. */ + address = RTVEC_ELT (XVEC (unspec, 0), 0); + gcc_assert (MEM_P (address)); + gcc_assert (REG_P (XEXP (address, 0))); + to_clear_mask &= ~(1LL << REGNO (XEXP (address, 0))); + + /* Set basic block of call insn so that df rescan is performed on + insns inserted here. */ + set_block_for_insn (insn, bb); + df_set_flags (DF_DEFER_INSN_RESCAN); + start_sequence (); + + /* Make sure the scheduler doesn't schedule other insns beyond + here. */ + emit_insn (gen_blockage ()); + + /* Walk through all arguments and clear registers appropriately. + */ + fntype = TREE_TYPE (MEM_EXPR (address)); + arm_init_cumulative_args (&args_so_far_v, fntype, NULL_RTX, + NULL_TREE); + args_so_far = pack_cumulative_args (&args_so_far_v); + FOREACH_FUNCTION_ARGS (fntype, arg_type, args_iter) + { + rtx arg_rtx; + machine_mode arg_mode = TYPE_MODE (arg_type); + + if (VOID_TYPE_P (arg_type)) + continue; + + if (!first_param) + arm_function_arg_advance (args_so_far, arg_mode, arg_type, + true); + + arg_rtx = arm_function_arg (args_so_far, arg_mode, arg_type, + true); + gcc_assert (REG_P (arg_rtx)); + to_clear_mask + &= ~compute_not_to_clear_mask (arg_type, arg_rtx, + REGNO (arg_rtx), + padding_bits_to_clear_ptr); + + first_param = false; + } + + /* Clear padding bits where needed. */ + cleared_reg = XEXP (address, 0); + reg = gen_rtx_REG (SImode, IP_REGNUM); + using_r4 = false; + for (regno = R0_REGNUM; regno < NUM_ARG_REGS; regno++) + { + if (padding_bits_to_clear[regno] == 0) + continue; + + /* If this is a Thumb-1 target copy the address of the function + we are calling from 'r4' into 'ip' such that we can use r4 to + clear the unused bits in the arguments. */ + if (TARGET_THUMB1 && !using_r4) + { + using_r4 = true; + reg = cleared_reg; + emit_move_insn (gen_rtx_REG (SImode, IP_REGNUM), + reg); + } + + tmp = GEN_INT ((((~padding_bits_to_clear[regno]) << 16u) >> 16u)); + emit_move_insn (reg, tmp); + /* Also fill the top half of the negated + padding_bits_to_clear. */ + if (((~padding_bits_to_clear[regno]) >> 16) > 0) + { + tmp = GEN_INT ((~padding_bits_to_clear[regno]) >> 16); + emit_insn (gen_rtx_SET (gen_rtx_ZERO_EXTRACT (SImode, reg, + GEN_INT (16), + GEN_INT (16)), + tmp)); + } + + emit_insn (gen_andsi3 (gen_rtx_REG (SImode, regno), + gen_rtx_REG (SImode, regno), + reg)); + + } + if (using_r4) + emit_move_insn (cleared_reg, + gen_rtx_REG (SImode, IP_REGNUM)); + + /* We use right shift and left shift to clear the LSB of the address + we jump to instead of using bic, to avoid having to use an extra + register on Thumb-1. */ + tmp = gen_rtx_LSHIFTRT (SImode, cleared_reg, const1_rtx); + emit_insn (gen_rtx_SET (cleared_reg, tmp)); + tmp = gen_rtx_ASHIFT (SImode, cleared_reg, const1_rtx); + emit_insn (gen_rtx_SET (cleared_reg, tmp)); + + /* Clearing all registers that leak before doing a non-secure + call. */ + for (regno = R0_REGNUM; regno <= maxregno; regno++) + { + if (!(to_clear_mask & (1LL << regno))) + continue; + + /* If regno is an even vfp register and its successor is also to + be cleared, use vmov. */ + if (IS_VFP_REGNUM (regno)) + { + if (TARGET_VFP_DOUBLE + && VFP_REGNO_OK_FOR_DOUBLE (regno) + && to_clear_mask & (1LL << (regno + 1))) + emit_move_insn (gen_rtx_REG (DFmode, regno++), + CONST0_RTX (DFmode)); + else + emit_move_insn (gen_rtx_REG (SFmode, regno), + CONST0_RTX (SFmode)); + } + else + emit_move_insn (gen_rtx_REG (SImode, regno), cleared_reg); + } + + seq = get_insns (); + end_sequence (); + emit_insn_before (seq, insn); + + } + } +} + /* Rewrite move insn into subtract of 0 if the condition codes will be useful in next conditional jump insn. */ @@ -16954,6 +17154,8 @@ arm_reorg (void) HOST_WIDE_INT address = 0; Mfix * fix; + if (use_cmse) + cmse_nonsecure_call_clear_caller_saved (); if (TARGET_THUMB1) thumb1_reorg (); else if (TARGET_THUMB2) @@ -17326,6 +17528,23 @@ vfp_emit_fstmd (int base_reg, int count) return count * 8; } +/* Returns true if -mcmse has been passed and the function pointed to by 'addr' + has the cmse_nonsecure_call attribute and returns false otherwise. */ + +bool +detect_cmse_nonsecure_call (tree addr) +{ + if (!addr) + return FALSE; + + tree fntype = TREE_TYPE (addr); + if (use_cmse && lookup_attribute ("cmse_nonsecure_call", + TYPE_ATTRIBUTES (fntype))) + return TRUE; + return FALSE; +} + + /* Emit a call instruction with pattern PAT. ADDR is the address of the call target. */ diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 5523baf..d561a4b 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -8052,6 +8052,7 @@ " { rtx callee, pat; + tree addr = MEM_EXPR (operands[0]); /* In an untyped call, we can get NULL for operand 2. */ if (operands[2] == NULL_RTX) @@ -8066,8 +8067,17 @@ : !REG_P (callee)) XEXP (operands[0], 0) = force_reg (Pmode, callee); - pat = gen_call_internal (operands[0], operands[1], operands[2]); - arm_emit_call_insn (pat, XEXP (operands[0], 0), false); + if (detect_cmse_nonsecure_call (addr)) + { + pat = gen_nonsecure_call_internal (operands[0], operands[1], + operands[2]); + emit_call_insn (pat); + } + else + { + pat = gen_call_internal (operands[0], operands[1], operands[2]); + arm_emit_call_insn (pat, XEXP (operands[0], 0), false); + } DONE; }" ) @@ -8078,6 +8088,24 @@ (use (match_operand 2 "" "")) (clobber (reg:SI LR_REGNUM))])]) +(define_expand "nonsecure_call_internal" + [(parallel [(call (unspec:SI [(match_operand 0 "memory_operand" "")] + UNSPEC_NONSECURE_MEM) + (match_operand 1 "general_operand" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (reg:SI 4))])] + "use_cmse" + " + { + rtx tmp; + tmp = copy_to_suggested_reg (XEXP (operands[0], 0), + gen_rtx_REG (SImode, 4), + SImode); + + operands[0] = replace_equiv_address (operands[0], tmp); + }") + (define_insn "*call_reg_armv5" [(call (mem:SI (match_operand:SI 0 "s_register_operand" "r")) (match_operand 1 "" "")) @@ -8113,6 +8141,7 @@ " { rtx pat, callee; + tree addr = MEM_EXPR (operands[1]); /* In an untyped call, we can get NULL for operand 2. */ if (operands[3] == 0) @@ -8127,9 +8156,18 @@ : !REG_P (callee)) XEXP (operands[1], 0) = force_reg (Pmode, callee); - pat = gen_call_value_internal (operands[0], operands[1], - operands[2], operands[3]); - arm_emit_call_insn (pat, XEXP (operands[1], 0), false); + if (detect_cmse_nonsecure_call (addr)) + { + pat = gen_nonsecure_call_value_internal (operands[0], operands[1], + operands[2], operands[3]); + emit_call_insn (pat); + } + else + { + pat = gen_call_value_internal (operands[0], operands[1], + operands[2], operands[3]); + arm_emit_call_insn (pat, XEXP (operands[1], 0), false); + } DONE; }" ) @@ -8141,6 +8179,25 @@ (use (match_operand 3 "" "")) (clobber (reg:SI LR_REGNUM))])]) +(define_expand "nonsecure_call_value_internal" + [(parallel [(set (match_operand 0 "" "") + (call (unspec:SI [(match_operand 1 "memory_operand" "")] + UNSPEC_NONSECURE_MEM) + (match_operand 2 "general_operand" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (reg:SI 4))])] + "use_cmse" + " + { + rtx tmp; + tmp = copy_to_suggested_reg (XEXP (operands[1], 0), + gen_rtx_REG (SImode, 4), + SImode); + + operands[1] = replace_equiv_address (operands[1], tmp); + }") + (define_insn "*call_value_reg_armv5" [(set (match_operand 0 "" "") (call (mem:SI (match_operand:SI 1 "s_register_operand" "r")) diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md index 73a7381..f9e934f 100644 --- a/gcc/config/arm/thumb1.md +++ b/gcc/config/arm/thumb1.md @@ -1731,6 +1731,19 @@ (set_attr "type" "call")] ) +(define_insn "*nonsecure_call_reg_thumb1_v5" + [(call (unspec:SI [(mem:SI (match_operand:SI 0 "register_operand" "l*r"))] + UNSPEC_NONSECURE_MEM) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (match_dup 0))] + "TARGET_THUMB1 && use_cmse && !SIBLING_CALL_P (insn)" + "bl\\t__gnu_cmse_nonsecure_call" + [(set_attr "length" "4") + (set_attr "type" "call")] +) + (define_insn "*call_reg_thumb1" [(call (mem:SI (match_operand:SI 0 "register_operand" "l*r")) (match_operand 1 "" "")) @@ -1763,6 +1776,21 @@ (set_attr "type" "call")] ) +(define_insn "*nonsecure_call_value_reg_thumb1_v5" + [(set (match_operand 0 "" "") + (call (unspec:SI + [(mem:SI (match_operand:SI 1 "register_operand" "l*r"))] + UNSPEC_NONSECURE_MEM) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (match_dup 1))] + "TARGET_THUMB1 && use_cmse" + "bl\\t__gnu_cmse_nonsecure_call" + [(set_attr "length" "4") + (set_attr "type" "call")] +) + (define_insn "*call_value_reg_thumb1" [(set (match_operand 0 "" "") (call (mem:SI (match_operand:SI 1 "register_operand" "l*r")) diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md index 9029a2f..9b078a5 100644 --- a/gcc/config/arm/thumb2.md +++ b/gcc/config/arm/thumb2.md @@ -580,6 +580,19 @@ [(set_attr "type" "call")] ) +(define_insn "*nonsecure_call_reg_thumb2" + [(call (unspec:SI [(mem:SI (match_operand:SI 0 "s_register_operand" "r"))] + UNSPEC_NONSECURE_MEM) + (match_operand 1 "" "")) + (use (match_operand 2 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (match_dup 0))] + "TARGET_THUMB2 && use_cmse" + "bl\\t__gnu_cmse_nonsecure_call" + [(set_attr "length" "4") + (set_attr "type" "call")] +) + (define_insn "*call_value_reg_thumb2" [(set (match_operand 0 "" "") (call (mem:SI (match_operand:SI 1 "register_operand" "l*r")) @@ -591,6 +604,21 @@ [(set_attr "type" "call")] ) +(define_insn "*nonsecure_call_value_reg_thumb2" + [(set (match_operand 0 "" "") + (call + (unspec:SI [(mem:SI (match_operand:SI 1 "register_operand" "l*r"))] + UNSPEC_NONSECURE_MEM) + (match_operand 2 "" ""))) + (use (match_operand 3 "" "")) + (clobber (reg:SI LR_REGNUM)) + (clobber (match_dup 1))] + "TARGET_THUMB2 && use_cmse" + "bl\t__gnu_cmse_nonsecure_call" + [(set_attr "length" "4") + (set_attr "type" "call")] +) + (define_insn "*thumb2_indirect_jump" [(set (pc) (match_operand:SI 0 "register_operand" "l*r"))] diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index bee8795..1aa39e8 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -84,6 +84,8 @@ UNSPEC_VRINTA ; Represent a float to integral float rounding ; towards nearest, ties away from zero. UNSPEC_PROBE_STACK ; Probe stack memory reference + UNSPEC_NONSECURE_MEM ; Represent non-secure memory in ARMv8-M with + ; security extension ]) (define_c_enum "unspec" [ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index ca431e4..e9786b7 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,6 +1,48 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * gcc.target/arm/cmse/cmse.exp: Run tests in mainline dir. + * gcc.target/arm/cmse/cmse-9.c: Added some extra tests. + * gcc.target/arm/cmse/cmse-14.c: New. + * gcc.target/arm/cmse/baseline/bitfield-4.c: New. + * gcc.target/arm/cmse/baseline/bitfield-5.c: New. + * gcc.target/arm/cmse/baseline/bitfield-6.c: New. + * gcc.target/arm/cmse/baseline/bitfield-7.c: New. + * gcc.target/arm/cmse/baseline/bitfield-8.c: New. + * gcc.target/arm/cmse/baseline/bitfield-9.c: New. + * gcc.target/arm/cmse/baseline/bitfield-and-union-1.c: New. + * gcc.target/arm/cmse/baseline/cmse-11.c: New. + * gcc.target/arm/cmse/baseline/cmse-13.c: New. + * gcc.target/arm/cmse/baseline/cmse-6.c: New. + * gcc.target/arm/cmse/baseline/union-1.c: New. + * gcc.target/arm/cmse/baseline/union-2.c: New. + * gcc.target/arm/cmse/mainline/bitfield-4.c: New. + * gcc.target/arm/cmse/mainline/bitfield-5.c: New. + * gcc.target/arm/cmse/mainline/bitfield-6.c: New. + * gcc.target/arm/cmse/mainline/bitfield-7.c: New. + * gcc.target/arm/cmse/mainline/bitfield-8.c: New. + * gcc.target/arm/cmse/mainline/bitfield-9.c: New. + * gcc.target/arm/cmse/mainline/bitfield-and-union-1.c: New. + * gcc.target/arm/cmse/mainline/union-1.c: New. + * gcc.target/arm/cmse/mainline/union-2.c: New. + * gcc.target/arm/cmse/mainline/hard-sp/cmse-13.c: New. + * gcc.target/arm/cmse/mainline/hard-sp/cmse-7.c: New. + * gcc.target/arm/cmse/mainline/hard-sp/cmse-8.c: New. + * gcc.target/arm/cmse/mainline/hard/cmse-13.c: New. + * gcc.target/arm/cmse/mainline/hard/cmse-7.c: New. + * gcc.target/arm/cmse/mainline/hard/cmse-8.c: New. + * gcc.target/arm/cmse/mainline/soft/cmse-13.c: New. + * gcc.target/arm/cmse/mainline/soft/cmse-7.c: New. + * gcc.target/arm/cmse/mainline/soft/cmse-8.c: New. + * gcc.target/arm/cmse/mainline/softfp-sp/cmse-7.c: New. + * gcc.target/arm/cmse/mainline/softfp-sp/cmse-8.c: New. + * gcc.target/arm/cmse/mainline/softfp/cmse-13.c: New. + * gcc.target/arm/cmse/mainline/softfp/cmse-7.c: New. + * gcc.target/arm/cmse/mainline/softfp/cmse-8.c: New. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * gcc.target/arm/cmse/cmse-3.c: Add tests. * gcc.target/arm/cmse/cmse-4.c: Add tests. * gcc.target/arm/cmse/cmse-15.c: New. diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-4.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-4.c new file mode 100644 index 0000000..a6c1386 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-4.c @@ -0,0 +1,57 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int b:5; + unsigned int c:11, :0, d:8; + struct { unsigned int ee:2; } e; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +extern void foo (test_st st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 255" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #255" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #3" } } */ +/* { dg-final { scan-assembler "ands\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-5.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-5.c new file mode 100644 index 0000000..d51ce2d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-5.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned short b :5; + unsigned char c; + unsigned short d :11; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #8191" } } */ +/* { dg-final { scan-assembler "movt\tr4, 255" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #2047" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-6.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-6.c new file mode 100644 index 0000000..77e9104 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-6.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int b : 3; + unsigned int c : 14; + unsigned int d : 1; + struct { + unsigned int ee : 2; + unsigned short ff : 15; + } e; + unsigned char g : 1; + unsigned char : 4; + unsigned char h : 3; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 1023" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #3" } } */ +/* { dg-final { scan-assembler "movt\tr4, 32767" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #255" } } */ +/* { dg-final { scan-assembler "ands\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-7.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-7.c new file mode 100644 index 0000000..3d8941b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-7.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned short b :5; + unsigned char c; + unsigned short d :11; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #8191" } } */ +/* { dg-final { scan-assembler "movt\tr4, 255" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #2047" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-8.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-8.c new file mode 100644 index 0000000..9ffbb71 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-8.c @@ -0,0 +1,57 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #255" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #1" } } */ +/* { dg-final { scan-assembler "movt\tr4, 65535" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 31" } } */ +/* { dg-final { scan-assembler "ands\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-9.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-9.c new file mode 100644 index 0000000..8a61418 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-9.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + char a:3; +} test_st3; + +typedef struct +{ + char a:3; +} test_st2; + +typedef struct +{ + test_st2 st2; + test_st3 st3; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #1799" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr1, r4" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-and-union-1.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-and-union-1.c new file mode 100644 index 0000000..642f4e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/bitfield-and-union-1.c @@ -0,0 +1,96 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned short a :11; +} test_st_4; + +typedef union +{ + char a; + test_st_4 st4; +}test_un_2; + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st_3; + +typedef struct +{ + unsigned char a :3; + unsigned int b :13; + test_un_2 un2; +} test_st_2; + +typedef union +{ + test_st_2 st2; + test_st_3 st3; +}test_un_1; + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned char c :4; + test_un_1 un1; +} test_st_1; + +typedef union +{ + test_st_1 st1; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st_1; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st_1); + +int +main (void) +{ + read_st_1 r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st1); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #7939" } } */ +/* { dg-final { scan-assembler "movt\tr4, 15" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 2047" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "movs\tr4, #1" } } */ +/* { dg-final { scan-assembler "movt\tr4, 65535" } } */ +/* { dg-final { scan-assembler "ands\tr2, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 31" } } */ +/* { dg-final { scan-assembler "ands\tr3, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-11.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-11.c new file mode 100644 index 0000000..3007409 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-11.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-add-options arm_arch_v8m_base } */ +/* { dg-options "-mcmse" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (int); + +int +foo (int a) +{ + return bar (bar (a + 1)); +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr1, r4" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-13.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-13.c new file mode 100644 index 0000000..f2b931b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-13.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-add-options arm_arch_v8m_base } */ +/* { dg-options "-mcmse" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +int +foo (int a) +{ + return bar (1.0f, 2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "movs\tr0, r4" } } */ +/* { dg-final { scan-assembler "\n\tmovs\tr1, r4" } } */ +/* { dg-final { scan-assembler-not "\n\tmovs\tr2, r4\n\tmovs\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-6.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-6.c new file mode 100644 index 0000000..95da045 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/cmse-6.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_base_ok } */ +/* { dg-add-options arm_arch_v8m_base } */ +/* { dg-options "-mcmse" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Remember dont clear r0 and r1, because we are passing the double parameter + * for bar in them. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/union-1.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/union-1.c new file mode 100644 index 0000000..ff18e83 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/union-1.c @@ -0,0 +1,71 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned short c :3; + unsigned char :0; + unsigned int d :9; +} test_st_1; + +typedef struct +{ + unsigned short a :7; + unsigned char :0; + unsigned char b :1; + unsigned char :0; + unsigned short c :6; +} test_st_2; + +typedef union +{ + test_st_1 st_1; + test_st_2 st_2; +}test_un; + +typedef union +{ + test_un un; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_un; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un); + +int +main (void) +{ + read_un r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.un); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #8063" } } */ +/* { dg-final { scan-assembler "movt\tr4, 63" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #511" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr2, r4" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/baseline/union-2.c b/gcc/testsuite/gcc.target/arm/cmse/baseline/union-2.c new file mode 100644 index 0000000..b2e024b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/baseline/union-2.c @@ -0,0 +1,86 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned short c :3; + unsigned char :0; + unsigned int d :9; +} test_st_1; + +typedef struct +{ + unsigned short a :7; + unsigned char :0; + unsigned char b :1; + unsigned char :0; + unsigned short c :6; +} test_st_2; + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st_3; + +typedef union +{ + test_st_1 st_1; + test_st_2 st_2; + test_st_3 st_3; +}test_un; + +typedef union +{ + test_un un; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_un; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un); + +int +main (void) +{ + read_un r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + + f (r.un); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #8191" } } */ +/* { dg-final { scan-assembler "movt\tr4, 63" } } */ +/* { dg-final { scan-assembler "ands\tr0, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #511" } } */ +/* { dg-final { scan-assembler "movt\tr4, 65535" } } */ +/* { dg-final { scan-assembler "ands\tr1, r4" } } */ +/* { dg-final { scan-assembler "movw\tr4, #65535" } } */ +/* { dg-final { scan-assembler "movt\tr4, 31" } } */ +/* { dg-final { scan-assembler "ands\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr4, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "movs\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-14.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-14.c new file mode 100644 index 0000000..701e9ee --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-14.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int foo (void) +{ + return bar (); +} + +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ +/* { dg-final { scan-assembler-not "b\[^ y\n\]*\\s+bar" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-9.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-9.c index 1d97f0e..9e81e30 100644 --- a/gcc/testsuite/gcc.target/arm/cmse/cmse-9.c +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-9.c @@ -2,11 +2,19 @@ /* { dg-skip-if "Testing exclusion of -mcmse" { arm-*-* } { "-mcmse" } { "" } } */ +void __attribute__ ((cmse_nonsecure_call)) (*bar) (int); /* { dg-warning "attribute ignored without -mcmse option" } */ +typedef void __attribute__ ((cmse_nonsecure_call)) baz (int); /* { dg-warning "attribute ignored without -mcmse option" } */ + int __attribute__ ((cmse_nonsecure_entry)) -foo (int a) +foo (int a, baz b) { /* { dg-warning "attribute ignored without -mcmse option" } */ + bar (a); + b (a); return a + 1; } +/* { dg-final { scan-assembler-not "bxns" } } */ +/* { dg-final { scan-assembler-not "blxns" } } */ +/* { dg-final { scan-assembler-not "bl\t__gnu_cmse_nonsecure_call" } } */ /* { dg-final { scan-assembler "foo:" } } */ /* { dg-final { scan-assembler-not "__acle_se_foo:" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse.exp b/gcc/testsuite/gcc.target/arm/cmse/cmse.exp index 38f1841..66a8b7d 100644 --- a/gcc/testsuite/gcc.target/arm/cmse/cmse.exp +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse.exp @@ -50,6 +50,8 @@ if {[check_effective_target_arm_arch_v8m_base_ok]} then { } if {[check_effective_target_arm_arch_v8m_main_ok]} then { + gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/*.c]] \ + "" $DEFAULT_CFLAGS # Mainline -mfloat-abi=soft gcc-dg-runtest [lsort [glob $srcdir/$subdir/mainline/soft/*.c]] \ "-mfloat-abi=soft" $DEFAULT_CFLAGS diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-4.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-4.c new file mode 100644 index 0000000..c3b1396 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-4.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int b:5; + unsigned int c:11, :0, d:8; + struct { unsigned int ee:2; } e; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +extern void foo (test_st st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 255" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #255" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #3" } } */ +/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-5.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-5.c new file mode 100644 index 0000000..0d02904 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-5.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned short b :5; + unsigned char c; + unsigned short d :11; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #8191" } } */ +/* { dg-final { scan-assembler "movt\tip, 255" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #2047" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-6.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-6.c new file mode 100644 index 0000000..005515a --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-6.c @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int b : 3; + unsigned int c : 14; + unsigned int d : 1; + struct { + unsigned int ee : 2; + unsigned short ff : 15; + } e; + unsigned char g : 1; + unsigned char : 4; + unsigned char h : 3; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 1023" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #3" } } */ +/* { dg-final { scan-assembler "movt\tip, 32767" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #255" } } */ +/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-7.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-7.c new file mode 100644 index 0000000..6dd218e --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-7.c @@ -0,0 +1,52 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned short b :5; + unsigned char c; + unsigned short d :11; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + + +/* { dg-final { scan-assembler "movw\tip, #8191" } } */ +/* { dg-final { scan-assembler "movt\tip, 255" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #2047" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-8.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-8.c new file mode 100644 index 0000000..c833bcb --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-8.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "mov\tip, #255" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #1" } } */ +/* { dg-final { scan-assembler "movt\tip, 65535" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 31" } } */ +/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-9.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-9.c new file mode 100644 index 0000000..d6e4cdb --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-9.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + char a:3; +} test_st3; + +typedef struct +{ + char a:3; +} test_st2; + +typedef struct +{ + test_st2 st2; + test_st3 st3; +} test_st; + +typedef union +{ + test_st st; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st; + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st); + +int +main (void) +{ + read_st r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + + f (r.st); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #1799" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-and-union-1.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-and-union-1.c new file mode 100644 index 0000000..e139ba6 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/bitfield-and-union-1.c @@ -0,0 +1,94 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned short a :11; +} test_st_4; + +typedef union +{ + char a; + test_st_4 st4; +}test_un_2; + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st_3; + +typedef struct +{ + unsigned char a :3; + unsigned int b :13; + test_un_2 un2; +} test_st_2; + +typedef union +{ + test_st_2 st2; + test_st_3 st3; +}test_un_1; + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned char c :4; + test_un_1 un1; +} test_st_1; + +typedef union +{ + test_st_1 st1; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_st_1; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_st_1); + +int +main (void) +{ + read_st_1 r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + r.values.v4 = 0xFFFFFFFF; + + f (r.st1); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #7939" } } */ +/* { dg-final { scan-assembler "movt\tip, 15" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 2047" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "mov\tip, #1" } } */ +/* { dg-final { scan-assembler "movt\tip, 65535" } } */ +/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 31" } } */ +/* { dg-final { scan-assembler "and\tr3, r3, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-13.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-13.c new file mode 100644 index 0000000..d90ad81 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-13.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */ + + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +int +foo (int a) +{ + return bar (3.0f, 2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts2, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-7.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-7.c new file mode 100644 index 0000000..c047cd5 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-7.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int +foo (int a) +{ + return bar () + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts0, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts2, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-8.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-8.c new file mode 100644 index 0000000..20d2d4a --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard-sp/cmse-8.c @@ -0,0 +1,41 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-sp-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts1, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts2, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts7, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts8, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts9, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts10, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts11, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts12, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts13, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts14, .L" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts15, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-13.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-13.c new file mode 100644 index 0000000..0af586a --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-13.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */ + + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +int +foo (int a) +{ + return bar (3.0f, 2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "vldr\.32\ts1, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.64\td0, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts0, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.64\td1, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts2, .L" } } */ +/* { dg-final { scan-assembler-not "vldr\.32\ts3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-7.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-7.c new file mode 100644 index 0000000..a5c64fb --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-7.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int +foo (int a) +{ + return bar () + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "vldr\.64\td0, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td1, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-8.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-8.c new file mode 100644 index 0000000..5e041b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/hard/cmse-8.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=softfp } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=hard -mfpu=fpv5-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vldr\.64\td0, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td1, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td2, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td3, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td4, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td5, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td6, .L" } } */ +/* { dg-final { scan-assembler "vldr\.64\td7, .L" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-13.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-13.c new file mode 100644 index 0000000..dbbd262 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-13.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */ +/* { dg-options "-mcmse -mfloat-abi=soft" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +int +foo (int a) +{ + return bar (1.0f, 2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-7.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-7.c new file mode 100644 index 0000000..e335684 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-7.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */ +/* { dg-options "-mcmse -mfloat-abi=soft" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int +foo (int a) +{ + return bar () + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-8.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-8.c new file mode 100644 index 0000000..024a12e --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/soft/cmse-8.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=hard" -mfloat-abi=softfp } {""} } */ +/* { dg-options "-mcmse -mfloat-abi=soft" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler-not "vmov" } } */ +/* { dg-final { scan-assembler-not "vmsr" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-7.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-7.c new file mode 100644 index 0000000..fb195eb --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-7.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int +foo (int a) +{ + return bar () + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-8.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-8.c new file mode 100644 index 0000000..22ed3f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp-sp/cmse-8.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing double precision" {*-*-*} {"-mfpu=fpv[4-5]-d16"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-sp-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-13.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-13.c new file mode 100644 index 0000000..9634065 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-13.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (float, double); + +int +foo (int a) +{ + return bar (1.0f, 2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "\n\tmov\tr1, r4" } } */ +/* { dg-final { scan-assembler-not "\n\tmov\tr2, r4\n\tmov\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-7.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-7.c new file mode 100644 index 0000000..04f8466 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-7.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (void); + +int +foo (int a) +{ + return bar () + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-8.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-8.c new file mode 100644 index 0000000..ffe94de --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/softfp/cmse-8.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_arch_v8m_main_ok } */ +/* { dg-add-options arm_arch_v8m_main } */ +/* { dg-skip-if "Do not combine float-abi= hard | soft | softfp" {*-*-*} {"-mfloat-abi=soft" -mfloat-abi=hard } {""} } */ +/* { dg-skip-if "Skip these if testing single precision" {*-*-*} {"-mfpu=*-sp-*"} {""} } */ +/* { dg-options "-mcmse -mfloat-abi=softfp -mfpu=fpv5-d16" } */ + +int __attribute__ ((cmse_nonsecure_call)) (*bar) (double); + +int +foo (int a) +{ + return bar (2.0) + a + 1; +} + +/* Checks for saving and clearing prior to function call. */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler-not "mov\tr0, r4" } } */ +/* { dg-final { scan-assembler-not "mov\tr1, r4" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ + +/* Now we check that we use the correct intrinsic to call. */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/union-1.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/union-1.c new file mode 100644 index 0000000..1fc846c --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/union-1.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned short c :3; + unsigned char :0; + unsigned int d :9; +} test_st_1; + +typedef struct +{ + unsigned short a :7; + unsigned char :0; + unsigned char b :1; + unsigned char :0; + unsigned short c :6; +} test_st_2; + +typedef union +{ + test_st_1 st_1; + test_st_2 st_2; +}test_un; + +typedef union +{ + test_un un; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_un; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un); + +int +main (void) +{ + read_un r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + + f (r.un); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #8063" } } */ +/* { dg-final { scan-assembler "movt\tip, 63" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #511" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr2, r4" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ + diff --git a/gcc/testsuite/gcc.target/arm/cmse/mainline/union-2.c b/gcc/testsuite/gcc.target/arm/cmse/mainline/union-2.c new file mode 100644 index 0000000..420d0f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/cmse/mainline/union-2.c @@ -0,0 +1,84 @@ +/* { dg-do compile } */ +/* { dg-options "-mcmse" } */ + +typedef struct +{ + unsigned char a :2; + unsigned char :0; + unsigned short b :5; + unsigned char :0; + unsigned short c :3; + unsigned char :0; + unsigned int d :9; +} test_st_1; + +typedef struct +{ + unsigned short a :7; + unsigned char :0; + unsigned char b :1; + unsigned char :0; + unsigned short c :6; +} test_st_2; + +typedef struct +{ + unsigned char a; + unsigned int :0; + unsigned int b :1; + unsigned short :0; + unsigned short c; + unsigned int :0; + unsigned int d :21; +} test_st_3; + +typedef union +{ + test_st_1 st_1; + test_st_2 st_2; + test_st_3 st_3; +}test_un; + +typedef union +{ + test_un un; + struct + { + unsigned int v1; + unsigned int v2; + unsigned int v3; + unsigned int v4; + }values; +} read_un; + + +typedef void __attribute__ ((cmse_nonsecure_call)) (*foo_ns) (test_un); + +int +main (void) +{ + read_un r; + foo_ns f; + + f = (foo_ns) 0x200000; + r.values.v1 = 0xFFFFFFFF; + r.values.v2 = 0xFFFFFFFF; + r.values.v3 = 0xFFFFFFFF; + + f (r.un); + return 0; +} + +/* { dg-final { scan-assembler "movw\tip, #8191" } } */ +/* { dg-final { scan-assembler "movt\tip, 63" } } */ +/* { dg-final { scan-assembler "and\tr0, r0, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #511" } } */ +/* { dg-final { scan-assembler "movt\tip, 65535" } } */ +/* { dg-final { scan-assembler "and\tr1, r1, ip" } } */ +/* { dg-final { scan-assembler "movw\tip, #65535" } } */ +/* { dg-final { scan-assembler "movt\tip, 31" } } */ +/* { dg-final { scan-assembler "and\tr2, r2, ip" } } */ +/* { dg-final { scan-assembler "lsrs\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "lsls\tr4, r4, #1" } } */ +/* { dg-final { scan-assembler "mov\tr3, r4" } } */ +/* { dg-final { scan-assembler "bl\t__gnu_cmse_nonsecure_call" } } */ diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog index f323f43..6627d26 100644 --- a/libgcc/ChangeLog +++ b/libgcc/ChangeLog @@ -1,6 +1,12 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * config/arm/cmse_nonsecure_call.S: New. + * config/arm/t-arm: Compile cmse_nonsecure_call.S + +2016-12-02 Andre Vieira + Thomas Preud'homme + * config/arm/t-arm (HAVE_CMSE): New. * config/arm/cmse.c: New. diff --git a/libgcc/config/arm/cmse_nonsecure_call.S b/libgcc/config/arm/cmse_nonsecure_call.S new file mode 100644 index 0000000..68b6a1c --- /dev/null +++ b/libgcc/config/arm/cmse_nonsecure_call.S @@ -0,0 +1,131 @@ +/* CMSE wrapper function used to save, clear and restore callee saved registers + for cmse_nonsecure_call's. + + Copyright (C) 2016 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +.syntax unified +.thumb +.global __gnu_cmse_nonsecure_call +__gnu_cmse_nonsecure_call: +#if defined(__ARM_ARCH_8M_MAIN__) +push {r5-r11,lr} +mov r7, r4 +mov r8, r4 +mov r9, r4 +mov r10, r4 +mov r11, r4 +mov ip, r4 + +/* Save and clear callee-saved registers only if we are dealing with hard float + ABI. The unused caller-saved registers have already been cleared by GCC + generated code. */ +#ifdef __ARM_PCS_VFP +vpush.f64 {d8-d15} +mov r5, #0 +vmov d8, r5, r5 +#if __ARM_FP & 0x04 +vmov s18, s19, r5, r5 +vmov s20, s21, r5, r5 +vmov s22, s23, r5, r5 +vmov s24, s25, r5, r5 +vmov s26, s27, r5, r5 +vmov s28, s29, r5, r5 +vmov s30, s31, r5, r5 +#elif __ARM_FP & 0x08 +vmov.f64 d9, d8 +vmov.f64 d10, d8 +vmov.f64 d11, d8 +vmov.f64 d12, d8 +vmov.f64 d13, d8 +vmov.f64 d14, d8 +vmov.f64 d15, d8 +#else +#error "Half precision implementation not supported." +#endif +/* Clear the cumulative exception-status bits (0-4,7) and the + condition code bits (28-31) of the FPSCR. */ +vmrs r5, fpscr +movw r6, #65376 +movt r6, #4095 +ands r5, r6 +vmsr fpscr, r5 + +/* We are not dealing with hard float ABI, so we can safely use the vlstm and + vlldm instructions without needing to preserve the registers used for + argument passing. */ +#else +sub sp, sp, #0x88 /* Reserve stack space to save all floating point + registers, including FPSCR. */ +vlstm sp /* Lazy store and clearance of d0-d16 and FPSCR. */ +#endif /* __ARM_PCS_VFP */ + +/* Make sure to clear the 'GE' bits of the APSR register if 32-bit SIMD + instructions are available. */ +#if defined(__ARM_FEATURE_SIMD32) +msr APSR_nzcvqg, r4 +#else +msr APSR_nzcvq, r4 +#endif + +mov r5, r4 +mov r6, r4 +blxns r4 + +#ifdef __ARM_PCS_VFP +vpop.f64 {d8-d15} +#else +vlldm sp /* Lazy restore of d0-d16 and FPSCR. */ +add sp, sp, #0x88 /* Free space used to save floating point registers. */ +#endif /* __ARM_PCS_VFP */ + +pop {r5-r11, pc} + +#elif defined (__ARM_ARCH_8M_BASE__) +push {r5-r7, lr} +mov r5, r8 +mov r6, r9 +mov r7, r10 +push {r5-r7} +mov r5, r11 +push {r5} +mov r5, r4 +mov r6, r4 +mov r7, r4 +mov r8, r4 +mov r9, r4 +mov r10, r4 +mov r11, r4 +mov ip, r4 +msr APSR_nzcvq, r4 +blxns r4 +pop {r5} +mov r11, r5 +pop {r5-r7} +mov r10, r7 +mov r9, r6 +mov r8, r5 +pop {r5-r7, pc} + +#else +#error "This should only be used for armv8-m base- and mainline." +#endif diff --git a/libgcc/config/arm/t-arm b/libgcc/config/arm/t-arm index 5618143..9e85ac0 100644 --- a/libgcc/config/arm/t-arm +++ b/libgcc/config/arm/t-arm @@ -12,4 +12,6 @@ libgcc-objects += cmse.o cmse_nonsecure_call.o cmse.o: $(srcdir)/config/arm/cmse.c $(gcc_compile) -c $(CMSE_OPTS) $< +cmse_nonsecure_call.o: $(srcdir)/config/arm/cmse_nonsecure_call.S + $(gcc_compile) -c $< endif -- cgit v1.1 From 8261e476cb64cee8891fa676202d1f42decdcd14 Mon Sep 17 00:00:00 2001 From: Andre Vieira Date: Fri, 2 Dec 2016 15:34:36 +0000 Subject: Added support for ARMV8-M Security Extension cmse_nonsecure_caller intrinsic gcc/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * config/arm/arm-builtins.c (arm_builtins): Define ARM_BUILTIN_CMSE_NONSECURE_CALLER. (bdesc_2arg): Add line for cmse_nonsecure_caller. (arm_init_builtins): Handle cmse_nonsecure_caller. (arm_expand_builtin): Likewise. * config/arm/arm_cmse.h (cmse_nonsecure_caller): New. gcc/testsuite/ChangeLog: 2016-12-02 Andre Vieira Thomas Preud'homme * gcc.target/arm/cmse/cmse-1.c: Add test for cmse_nonsecure_caller. Co-Authored-By: Thomas Preud'homme From-SVN: r243193 --- gcc/ChangeLog | 10 ++++++++ gcc/config/arm/arm-builtins.c | 19 +++++++++++++++ gcc/config/arm/arm_cmse.h | 7 ++++++ gcc/doc/extend.texi | 1 + gcc/testsuite/ChangeLog | 6 +++++ gcc/testsuite/gcc.target/arm/cmse/cmse-1.c | 39 ++++++++++++++++++++++++++++++ 6 files changed, 82 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 807d406..65443a1 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,6 +1,16 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * config/arm/arm-builtins.c (arm_builtins): Define + ARM_BUILTIN_CMSE_NONSECURE_CALLER. + (bdesc_2arg): Add line for cmse_nonsecure_caller. + (arm_init_builtins): Handle cmse_nonsecure_caller. + (arm_expand_builtin): Likewise. + * config/arm/arm_cmse.h (cmse_nonsecure_caller): New. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * config/arm/arm.c (detect_cmse_nonsecure_call): New. (cmse_nonsecure_call_clear_caller_saved): New. (arm_reorg): Use cmse_nonsecure_call_clear_caller_saved. diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c index 5ed38d1..1444420 100644 --- a/gcc/config/arm/arm-builtins.c +++ b/gcc/config/arm/arm-builtins.c @@ -528,6 +528,8 @@ enum arm_builtins ARM_BUILTIN_GET_FPSCR, ARM_BUILTIN_SET_FPSCR, + ARM_BUILTIN_CMSE_NONSECURE_CALLER, + #undef CRYPTO1 #undef CRYPTO2 #undef CRYPTO3 @@ -1833,6 +1835,17 @@ arm_init_builtins (void) = add_builtin_function ("__builtin_arm_stfscr", ftype_set_fpscr, ARM_BUILTIN_SET_FPSCR, BUILT_IN_MD, NULL, NULL_TREE); } + + if (use_cmse) + { + tree ftype_cmse_nonsecure_caller + = build_function_type_list (unsigned_type_node, NULL); + arm_builtin_decls[ARM_BUILTIN_CMSE_NONSECURE_CALLER] + = add_builtin_function ("__builtin_arm_cmse_nonsecure_caller", + ftype_cmse_nonsecure_caller, + ARM_BUILTIN_CMSE_NONSECURE_CALLER, BUILT_IN_MD, + NULL, NULL_TREE); + } } /* Return the ARM builtin for CODE. */ @@ -2453,6 +2466,12 @@ arm_expand_builtin (tree exp, emit_insn (pat); return target; + case ARM_BUILTIN_CMSE_NONSECURE_CALLER: + target = gen_reg_rtx (SImode); + op0 = arm_return_addr (0, NULL_RTX); + emit_insn (gen_addsi3 (target, op0, const1_rtx)); + return target; + case ARM_BUILTIN_TEXTRMSB: case ARM_BUILTIN_TEXTRMUB: case ARM_BUILTIN_TEXTRMSH: diff --git a/gcc/config/arm/arm_cmse.h b/gcc/config/arm/arm_cmse.h index 894343b..82b58b1 100644 --- a/gcc/config/arm/arm_cmse.h +++ b/gcc/config/arm/arm_cmse.h @@ -163,6 +163,13 @@ __attribute__ ((__always_inline__)) cmse_TTAT (void *__p) __CMSE_TT_ASM (at) +/* FIXME: diagnose use outside cmse_nonsecure_entry functions. */ +__extension__ static __inline int __attribute__ ((__always_inline__)) +cmse_nonsecure_caller (void) +{ + return __builtin_arm_cmse_nonsecure_caller (); +} + #define CMSE_AU_NONSECURE 2 #define CMSE_MPU_NONSECURE 16 #define CMSE_NONSECURE 18 diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 0fa59ff..a8402e1 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -12666,6 +12666,7 @@ cmse_address_info_t cmse_TTAT_fptr (FPTR) void * cmse_check_address_range (void *, size_t, int) typeof(p) cmse_nsfptr_create (FPTR p) intptr_t cmse_is_nsfptr (FPTR) +int cmse_nonsecure_caller (void) @end smallexample @node AVR Built-in Functions diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index e9786b7..7cb66d9 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,6 +1,12 @@ 2016-12-02 Andre Vieira Thomas Preud'homme + * gcc.target/arm/cmse/cmse-1.c: Add test for + cmse_nonsecure_caller. + +2016-12-02 Andre Vieira + Thomas Preud'homme + * gcc.target/arm/cmse/cmse.exp: Run tests in mainline dir. * gcc.target/arm/cmse/cmse-9.c: Added some extra tests. * gcc.target/arm/cmse/cmse-14.c: New. diff --git a/gcc/testsuite/gcc.target/arm/cmse/cmse-1.c b/gcc/testsuite/gcc.target/arm/cmse/cmse-1.c index d5b9a2d..c13272e 100644 --- a/gcc/testsuite/gcc.target/arm/cmse/cmse-1.c +++ b/gcc/testsuite/gcc.target/arm/cmse/cmse-1.c @@ -65,3 +65,42 @@ int foo (char * p) /* { dg-final { scan-assembler-times "ttat " 2 } } */ /* { dg-final { scan-assembler-times "bl.cmse_check_address_range" 7 } } */ /* { dg-final { scan-assembler-not "cmse_check_pointed_object" } } */ + +int __attribute__ ((cmse_nonsecure_entry)) +baz (void) +{ + return cmse_nonsecure_caller (); +} + +typedef int __attribute__ ((cmse_nonsecure_call)) (int_nsfunc_t) (void); + +int default_callback (void) +{ + return 0; +} + +int_nsfunc_t * fp = (int_nsfunc_t *) default_callback; + +void __attribute__ ((cmse_nonsecure_entry)) +qux (int_nsfunc_t * callback) +{ + fp = cmse_nsfptr_create (callback); +} + +int call_callback (void) +{ + if (cmse_is_nsfptr (fp)) + return fp (); + else + return default_callback (); +} +/* { dg-final { scan-assembler "baz:" } } */ +/* { dg-final { scan-assembler "__acle_se_baz:" } } */ +/* { dg-final { scan-assembler "qux:" } } */ +/* { dg-final { scan-assembler "__acle_se_qux:" } } */ +/* { dg-final { scan-assembler-not "\tcmse_nonsecure_caller" } } */ +/* { dg-final { scan-rtl-dump "and.*reg.*const_int 1" expand } } */ +/* { dg-final { scan-assembler "bic" } } */ +/* { dg-final { scan-assembler "push\t\{r4, r5, r6" } } */ +/* { dg-final { scan-assembler "msr\tAPSR_nzcvq" } } */ +/* { dg-final { scan-assembler-times "bl\\s+__gnu_cmse_nonsecure_call" 1 } } */ -- cgit v1.1 From eb61d07edaf05f36151bfe4382777eaa79bce4d9 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 2 Dec 2016 16:42:04 +0100 Subject: re PR target/78614 (ICE error: invalid rtl sharing found in the insn (verify_rtx_sharing) gcc/emit-rtl.c:2743) PR target/78614 * rtl.c (copy_rtx): Don't clear used flag here. (shallow_copy_rtx_stat): Clear used flag here unless code the rtx is shareable. * simplify-rtx.c (simplify_replace_fn_rtx): When copying rtx with 'E' in format, copy all vectors. * emit-rtl.c (copy_insn_1): Don't clear used flag here. * valtrack.c (cleanup_auto_inc_dec): Likewise. * config/rs6000/rs6000.c (rs6000_frame_related): Likewise. From-SVN: r243194 --- gcc/ChangeLog | 12 ++++++++++++ gcc/config/rs6000/rs6000.c | 1 - gcc/emit-rtl.c | 4 ---- gcc/rtl.c | 28 +++++++++++++++++++++++----- gcc/valtrack.c | 4 ---- 5 files changed, 35 insertions(+), 14 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 65443a1..68d3588 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,15 @@ +2016-12-02 Jakub Jelinek + + PR target/78614 + * rtl.c (copy_rtx): Don't clear used flag here. + (shallow_copy_rtx_stat): Clear used flag here unless code the rtx + is shareable. + * simplify-rtx.c (simplify_replace_fn_rtx): When copying rtx with + 'E' in format, copy all vectors. + * emit-rtl.c (copy_insn_1): Don't clear used flag here. + * valtrack.c (cleanup_auto_inc_dec): Likewise. + * config/rs6000/rs6000.c (rs6000_frame_related): Likewise. + 2016-12-02 Andre Vieira Thomas Preud'homme diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 425a885..59bd3fe 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -27186,7 +27186,6 @@ rs6000_frame_related (rtx_insn *insn, rtx reg, HOST_WIDE_INT val, { pat = shallow_copy_rtx (pat); XVEC (pat, 0) = shallow_copy_rtvec (XVEC (pat, 0)); - RTX_FLAG (pat, used) = 0; for (int i = 0; i < XVECLEN (pat, 0); i++) if (GET_CODE (XVECEXP (pat, 0, i)) == SET) diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c index d2ac88b..4650540 100644 --- a/gcc/emit-rtl.c +++ b/gcc/emit-rtl.c @@ -5552,10 +5552,6 @@ copy_insn_1 (rtx orig) us to explicitly document why we are *not* copying a flag. */ copy = shallow_copy_rtx (orig); - /* We do not copy the USED flag, which is used as a mark bit during - walks over the RTL. */ - RTX_FLAG (copy, used) = 0; - /* We do not copy JUMP, CALL, or FRAME_RELATED for INSNs. */ if (INSN_P (orig)) { diff --git a/gcc/rtl.c b/gcc/rtl.c index 3fac1931..0410f01 100644 --- a/gcc/rtl.c +++ b/gcc/rtl.c @@ -318,10 +318,6 @@ copy_rtx (rtx orig) us to explicitly document why we are *not* copying a flag. */ copy = shallow_copy_rtx (orig); - /* We do not copy the USED flag, which is used as a mark bit during - walks over the RTL. */ - RTX_FLAG (copy, used) = 0; - format_ptr = GET_RTX_FORMAT (GET_CODE (copy)); for (i = 0; i < GET_RTX_LENGTH (GET_CODE (copy)); i++) @@ -367,7 +363,29 @@ shallow_copy_rtx_stat (const_rtx orig MEM_STAT_DECL) { const unsigned int size = rtx_size (orig); rtx const copy = ggc_alloc_rtx_def_stat (size PASS_MEM_STAT); - return (rtx) memcpy (copy, orig, size); + memcpy (copy, orig, size); + switch (GET_CODE (orig)) + { + /* RTX codes copy_rtx_if_shared_1 considers are shareable, + the used flag is often used for other purposes. */ + case REG: + case DEBUG_EXPR: + case VALUE: + CASE_CONST_ANY: + case SYMBOL_REF: + case CODE_LABEL: + case PC: + case CC0: + case RETURN: + case SIMPLE_RETURN: + case SCRATCH: + break; + default: + /* For all other RTXes clear the used flag on the copy. */ + RTX_FLAG (copy, used) = 0; + break; + } + return copy; } /* Nonzero when we are generating CONCATs. */ diff --git a/gcc/valtrack.c b/gcc/valtrack.c index 9a1ae2d..002f49f 100644 --- a/gcc/valtrack.c +++ b/gcc/valtrack.c @@ -119,10 +119,6 @@ cleanup_auto_inc_dec (rtx src, machine_mode mem_mode ATTRIBUTE_UNUSED) us to explicitly document why we are *not* copying a flag. */ x = shallow_copy_rtx (x); - /* We do not copy the USED flag, which is used as a mark bit during - walks over the RTL. */ - RTX_FLAG (x, used) = 0; - /* We do not copy FRAME_RELATED for INSNs. */ if (INSN_P (x)) RTX_FLAG (x, frame_related) = 0; -- cgit v1.1 From 17c69eff82d20174099bad6bbd67dbf5e76c39a5 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 2 Dec 2016 17:28:41 +0100 Subject: re PR target/70322 (STV doesn't optimize andn) PR target/70322 * config/i386/i386.c (dimode_scalar_to_vector_candidate_p): Handle NOT. (dimode_scalar_chain::compute_convert_gain): Likewise. (dimode_scalar_chain::convert_insn): Likewise. * config/i386/i386.md (*one_cmpldi2_doubleword): New define_insn_and_split. (one_cmpl2): Use SWIM1248x iterator instead of SWIM. * gcc.target/i386/pr70322-1.c: New test. * gcc.target/i386/pr70322-2.c: New test. * gcc.target/i386/pr70322-3.c: New test. From-SVN: r243195 --- gcc/ChangeLog | 9 +++++++++ gcc/config/i386/i386.c | 16 +++++++++++++++- gcc/config/i386/i386.md | 17 +++++++++++++++-- gcc/testsuite/ChangeLog | 7 +++++++ gcc/testsuite/gcc.target/i386/pr70322-1.c | 12 ++++++++++++ gcc/testsuite/gcc.target/i386/pr70322-2.c | 12 ++++++++++++ gcc/testsuite/gcc.target/i386/pr70322-3.c | 13 +++++++++++++ 7 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr70322-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr70322-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr70322-3.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 68d3588..33fd2eb 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,14 @@ 2016-12-02 Jakub Jelinek + PR target/70322 + * config/i386/i386.c (dimode_scalar_to_vector_candidate_p): Handle + NOT. + (dimode_scalar_chain::compute_convert_gain): Likewise. + (dimode_scalar_chain::convert_insn): Likewise. + * config/i386/i386.md (*one_cmpldi2_doubleword): New + define_insn_and_split. + (one_cmpl2): Use SWIM1248x iterator instead of SWIM. + PR target/78614 * rtl.c (copy_rtx): Don't clear used flag here. (shallow_copy_rtx_stat): Clear used flag here unless code the rtx diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5678fa2..0bee09b 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2826,6 +2826,9 @@ dimode_scalar_to_vector_candidate_p (rtx_insn *insn) return false; break; + case NOT: + break; + case REG: return true; @@ -2848,7 +2851,8 @@ dimode_scalar_to_vector_candidate_p (rtx_insn *insn) if ((GET_MODE (XEXP (src, 0)) != DImode && !CONST_INT_P (XEXP (src, 0))) - || (GET_MODE (XEXP (src, 1)) != DImode + || (GET_CODE (src) != NOT + && GET_MODE (XEXP (src, 1)) != DImode && !CONST_INT_P (XEXP (src, 1)))) return false; @@ -3415,6 +3419,8 @@ dimode_scalar_chain::compute_convert_gain () if (CONST_INT_P (XEXP (src, 1))) gain -= vector_const_cost (XEXP (src, 1)); } + else if (GET_CODE (src) == NOT) + gain += ix86_cost->add - COSTS_N_INSNS (1); else if (GET_CODE (src) == COMPARE) { /* Assume comparison cost is the same. */ @@ -3770,6 +3776,14 @@ dimode_scalar_chain::convert_insn (rtx_insn *insn) PUT_MODE (src, V2DImode); break; + case NOT: + src = XEXP (src, 0); + convert_op (&src, insn); + subreg = gen_reg_rtx (V2DImode); + emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn); + src = gen_rtx_XOR (V2DImode, src, subreg); + break; + case MEM: if (!REG_P (dst)) convert_op (&src, insn); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 583d2bb..da7cb07 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -9312,9 +9312,22 @@ ;; One complement instructions +(define_insn_and_split "*one_cmpldi2_doubleword" + [(set (match_operand:DI 0 "nonimmediate_operand" "=rm") + (not:DI (match_operand:DI 1 "nonimmediate_operand" "0")))] + "!TARGET_64BIT && TARGET_STV && TARGET_SSE2 + && ix86_unary_operator_ok (NOT, DImode, operands)" + "#" + "&& reload_completed" + [(set (match_dup 0) + (not:SI (match_dup 1))) + (set (match_dup 2) + (not:SI (match_dup 3)))] + "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);") + (define_expand "one_cmpl2" - [(set (match_operand:SWIM 0 "nonimmediate_operand") - (not:SWIM (match_operand:SWIM 1 "nonimmediate_operand")))] + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") + (not:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")))] "" "ix86_expand_unary_operator (NOT, mode, operands); DONE;") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 7cb66d9..3a478b9 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2016-12-02 Jakub Jelinek + + PR target/70322 + * gcc.target/i386/pr70322-1.c: New test. + * gcc.target/i386/pr70322-2.c: New test. + * gcc.target/i386/pr70322-3.c: New test. + 2016-12-02 Andre Vieira Thomas Preud'homme diff --git a/gcc/testsuite/gcc.target/i386/pr70322-1.c b/gcc/testsuite/gcc.target/i386/pr70322-1.c new file mode 100644 index 0000000..bc10675 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr70322-1.c @@ -0,0 +1,12 @@ +/* PR target/70322 */ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-O2 -msse2 -mstv -mbmi" } */ +/* { dg-final { scan-assembler "pandn" } } */ + +extern long long z; + +void +foo (long long x, long long y) +{ + z = ~x & y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr70322-2.c b/gcc/testsuite/gcc.target/i386/pr70322-2.c new file mode 100644 index 0000000..7c5d0be --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr70322-2.c @@ -0,0 +1,12 @@ +/* PR target/70322 */ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-O2 -msse2 -mstv -mno-bmi" } */ +/* { dg-final { scan-assembler "pandn" { xfail *-*-* } } } */ + +extern long long z; + +void +foo (long long x, long long y) +{ + z = ~x & y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr70322-3.c b/gcc/testsuite/gcc.target/i386/pr70322-3.c new file mode 100644 index 0000000..89a8da3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr70322-3.c @@ -0,0 +1,13 @@ +/* PR target/70322 */ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-O2 -msse2 -mstv" } */ +/* { dg-final { scan-assembler "pxor" } } */ +/* { dg-final { scan-assembler "por" } } */ + +extern long long z; + +void +foo (long long x, long long y) +{ + z = ~x | y; +} -- cgit v1.1 From d2c82d98de3100a10731d982bf3fe571dba03e13 Mon Sep 17 00:00:00 2001 From: Tadek Kijkowski Date: Fri, 2 Dec 2016 16:34:28 +0000 Subject: Makefile.in (PREPROCESSOR_DEFINES): Add a level of indirection for several include directories that may be relative... * Makefile.in (PREPROCESSOR_DEFINES): Add a level of indirection for several include directories that may be relative to sysroot. * config/i386/x-mingw32 (gplus_includedir): Define. (gplus_tool_includedir, gplus_backward_include_dir): Likewise. (native_system_includedir): Likewise. * config/i386/mingw32.h (STANDARD_STARTFILE_PREFIX_1): Do not override if TARGET_SYSTEM_ROOT is defined. (NATIVE_SYSTEM_HEADER_DIR): Likewise. From-SVN: r243196 --- gcc/ChangeLog | 11 +++++++++++ gcc/Makefile.in | 17 +++++++++++++---- gcc/config/i386/mingw32.h | 4 ++++ gcc/config/i386/x-mingw32 | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 4 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 33fd2eb..c984483 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,14 @@ +2016-12-02 Tadek Kijkowski + + * Makefile.in (PREPROCESSOR_DEFINES): Add a level of indirection + for several include directories that may be relative to sysroot. + * config/i386/x-mingw32 (gplus_includedir): Define. + (gplus_tool_includedir, gplus_backward_include_dir): Likewise. + (native_system_includedir): Likewise. + * config/i386/mingw32.h (STANDARD_STARTFILE_PREFIX_1): Do not + override if TARGET_SYSTEM_ROOT is defined. + (NATIVE_SYSTEM_HEADER_DIR): Likewise. + 2016-12-02 Jakub Jelinek PR target/70322 diff --git a/gcc/Makefile.in b/gcc/Makefile.in index df4f64f..c7b1eaf 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -675,6 +675,15 @@ objext = .o exeext = @host_exeext@ build_exeext = @build_exeext@ + +# This allows overriding include paths in host specific Makefile +# (config/i386/x-mingw32 overrides those variables and local_includedir). +gplusplus_includedir = $(gcc_gxx_include_dir) +gplusplus_tool_includedir = $(gcc_gxx_include_dir)/$(target_noncanonical) +gplusplus_backward_includedir = $(gcc_gxx_include_dir)/backward +native_system_includedir = $(NATIVE_SYSTEM_HEADER_DIR) + + # Directory in which to put man pages. mandir = @mandir@ man1dir = $(mandir)/man1 @@ -2768,14 +2777,14 @@ CFLAGS-intl.o += -DLOCALEDIR=\"$(localedir)\" PREPROCESSOR_DEFINES = \ -DGCC_INCLUDE_DIR=\"$(libsubdir)/include\" \ -DFIXED_INCLUDE_DIR=\"$(libsubdir)/include-fixed\" \ - -DGPLUSPLUS_INCLUDE_DIR=\"$(gcc_gxx_include_dir)\" \ + -DGPLUSPLUS_INCLUDE_DIR=\"$(gplusplus_includedir)\" \ -DGPLUSPLUS_INCLUDE_DIR_ADD_SYSROOT=$(gcc_gxx_include_dir_add_sysroot) \ - -DGPLUSPLUS_TOOL_INCLUDE_DIR=\"$(gcc_gxx_include_dir)/$(target_noncanonical)\" \ - -DGPLUSPLUS_BACKWARD_INCLUDE_DIR=\"$(gcc_gxx_include_dir)/backward\" \ + -DGPLUSPLUS_TOOL_INCLUDE_DIR=\"$(gplusplus_tool_includedir)\" \ + -DGPLUSPLUS_BACKWARD_INCLUDE_DIR=\"$(gplusplus_backward_includedir)\" \ -DLOCAL_INCLUDE_DIR=\"$(local_includedir)\" \ -DCROSS_INCLUDE_DIR=\"$(CROSS_SYSTEM_HEADER_DIR)\" \ -DTOOL_INCLUDE_DIR=\"$(gcc_tooldir)/include\" \ - -DNATIVE_SYSTEM_HEADER_DIR=\"$(NATIVE_SYSTEM_HEADER_DIR)\" \ + -DNATIVE_SYSTEM_HEADER_DIR=\"$(native_system_includedir)\" \ -DPREFIX=\"$(prefix)/\" \ -DSTANDARD_EXEC_PREFIX=\"$(libdir)/gcc/\" \ @TARGET_SYSTEM_ROOT_DEFINE@ diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h index ac4aa0d..258d3cd 100644 --- a/gcc/config/i386/mingw32.h +++ b/gcc/config/i386/mingw32.h @@ -161,6 +161,8 @@ along with GCC; see the file COPYING3. If not see fvtable-verify=std:vtv_end.o%s} \ crtend.o%s" +#if !defined(TARGET_SYSTEM_ROOT) + /* Override startfile prefix defaults. */ #ifndef STANDARD_STARTFILE_PREFIX_1 #define STANDARD_STARTFILE_PREFIX_1 "/mingw/lib/" @@ -174,6 +176,8 @@ along with GCC; see the file COPYING3. If not see #undef NATIVE_SYSTEM_HEADER_DIR #define NATIVE_SYSTEM_HEADER_DIR "/mingw/include" +#endif /* !defined(TARGET_SYSTEM_ROOT) */ + /* Output STRING, a string representing a filename, to FILE. We canonicalize it to be in Unix format (backslashes are replaced forward slashes. */ diff --git a/gcc/config/i386/x-mingw32 b/gcc/config/i386/x-mingw32 index 1d28a70..6eb6167 100644 --- a/gcc/config/i386/x-mingw32 +++ b/gcc/config/i386/x-mingw32 @@ -16,10 +16,44 @@ # along with GCC; see the file COPYING3. If not see # . # + +# MSYS will zealously translate all paths to Windows form, so /usr/include becomes c:/msysX/usr/include. +# This is undesirable when TARGET_SYSTEM_ROOT is specified, so this function converts /usr/include to //usr\include, +# which will become /usr/include again when passed to gcc. + +# This function takes two parameters: first parameter is include directory path, second parameter tells +# if the path is relative to TARGET_SYSTEM_ROOT. +# If TARGET_SYSTEM_ROOT is not configured, or +# this function always expands to the unmodified first parameter +# if TARGET_SYSTEM_ROOT is configured, but second parameter is not 1, +# this function again expands to the unmodified first parameter +# otherwise, +# it expands to a shell expression which will transform the first parameter as described above. +ifneq ($(TARGET_SYSTEM_ROOT),) +sysroot_relative_path = $(if $(filter 1,$(2)),`echo "$(1)" | tr '/' '\\\\' | sed 's|^\\\\|//|'`,$(1)) +else +sysroot_relative_path = $(1) +endif + +ifneq ($(TARGET_SYSTEM_ROOT),) +# +# Make sure that relative the path is not converted to absolute DOS style path +# +local_includedir = $(call sysroot_relative_path,$(local_prefix)/include,1) +else # # Make local_includedir relative to EXEC_PREFIX # local_includedir=$(libsubdir)/$(unlibsubdir)/..`echo $(exec_prefix) | sed -e 's|^$(prefix)||' -e 's|/[^/]*|/..|g'`/include +endif + +# +# Make sure that relative path are not converted to absolute DOS style paths +# +gplusplus_includedir = $(call sysroot_relative_path,$(gcc_gxx_include_dir),$(gcc_gxx_include_dir_add_sysroot)) +gplusplus_tool_includedir = $(call sysroot_relative_path,$(gcc_gxx_include_dir)/$(target_noncanonical),$(gcc_gxx_include_dir_add_sysroot)) +gplusplus_backward_includedir = $(call sysroot_relative_path,$(gcc_gxx_include_dir)/backward,$(gcc_gxx_include_dir_add_sysroot)) +native_system_includedir = $(call sysroot_relative_path,$(NATIVE_SYSTEM_HEADER_DIR),1) # On MinGW, we use "%IA64d" to print 64-bit integers, and the format-checking # code does not handle that, so we have to disable checking here. -- cgit v1.1 From 2e3af7e2f4b217104b1706994d9f384083a6d98c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 2 Dec 2016 17:53:23 +0100 Subject: alpha.md (exception_receiver): Copy alpha_gp_ave_rtx return value. * config/alpha/alpha.md (exception_receiver): Copy alpha_gp_ave_rtx return value. From-SVN: r243197 --- gcc/ChangeLog | 10 +++++++--- gcc/config/alpha/alpha.md | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c984483..29e40ae 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Uros Bizjak + + * config/alpha/alpha.md (exception_receiver): Copy + alpha_gp_ave_rtx return value. + 2016-12-02 Tadek Kijkowski * Makefile.in (PREPROCESSOR_DEFINES): Add a level of indirection @@ -12,8 +17,7 @@ 2016-12-02 Jakub Jelinek PR target/70322 - * config/i386/i386.c (dimode_scalar_to_vector_candidate_p): Handle - NOT. + * config/i386/i386.c (dimode_scalar_to_vector_candidate_p): Handle NOT. (dimode_scalar_chain::compute_convert_gain): Likewise. (dimode_scalar_chain::convert_insn): Likewise. * config/i386/i386.md (*one_cmpldi2_doubleword): New @@ -341,7 +345,7 @@ early_dwarf_finished. 2016-12-01 Eric Botcazou - David S. Miller + David S. Miller * config/sparc/sparc.opt (mlra): New target option. * config/sparc/sparc.c (TARGET_LRA_P): Define to... diff --git a/gcc/config/alpha/alpha.md b/gcc/config/alpha/alpha.md index 3e4594b..0ed29de 100644 --- a/gcc/config/alpha/alpha.md +++ b/gcc/config/alpha/alpha.md @@ -5142,7 +5142,7 @@ "TARGET_ABI_OSF" { if (flag_reorder_blocks_and_partition) - operands[0] = alpha_gp_save_rtx (); + operands[0] = copy_rtx (alpha_gp_save_rtx ()); else operands[0] = const0_rtx; }) -- cgit v1.1 From 8a87dced20fc6cc3951038df3bdc30e453af5fb9 Mon Sep 17 00:00:00 2001 From: Martin Jambor Date: Fri, 2 Dec 2016 18:05:10 +0100 Subject: Move rebuild_cfg to the end of build_ssa_passes 2016-12-02 Martin Jambor * passes.def: Move pass_rebuild_cgraph_edges to the end of pass_build_ssa_passes. From-SVN: r243199 --- gcc/ChangeLog | 5 +++++ gcc/passes.def | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 29e40ae..47b3f84 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Martin Jambor + + * passes.def: Move pass_rebuild_cgraph_edges to the end of + pass_build_ssa_passes. + 2016-12-02 Uros Bizjak * config/alpha/alpha.md (exception_receiver): Copy diff --git a/gcc/passes.def b/gcc/passes.def index 1117b8b..7b12a41 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -56,12 +56,12 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_build_ssa_passes); PUSH_INSERT_PASSES_WITHIN (pass_build_ssa_passes) NEXT_PASS (pass_fixup_cfg); - NEXT_PASS (pass_rebuild_cgraph_edges); NEXT_PASS (pass_build_ssa); NEXT_PASS (pass_warn_nonnull_compare); NEXT_PASS (pass_ubsan); NEXT_PASS (pass_early_warn_uninitialized); NEXT_PASS (pass_nothrow); + NEXT_PASS (pass_rebuild_cgraph_edges); POP_INSERT_PASSES () NEXT_PASS (pass_chkp_instrumentation_passes); -- cgit v1.1 From 827ab47ab1f9f9b9b108a252b7a43c3c7bc828b7 Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Fri, 2 Dec 2016 17:13:08 +0000 Subject: [AArch64] Separate shrink wrapping hooks implementation * config/aarch64/aarch64.h (machine_function): Add reg_is_wrapped_separately field. * config/aarch64/aarch64.md (LAST_SAVED_REGNUM): Define new constant. * config/aarch64/aarch64.c (emit_set_insn): Change return type to rtx_insn *. (aarch64_save_callee_saves): Don't save registers that are wrapped separately. (aarch64_restore_callee_saves): Don't restore registers that are wrapped separately. (offset_9bit_signed_unscaled_p, offset_12bit_unsigned_scaled_p, aarch64_offset_7bit_signed_scaled_p): Move earlier in the file. (aarch64_get_separate_components): New function. (aarch64_get_next_set_bit): Likewise. (aarch64_components_for_bb): Likewise. (aarch64_disqualify_components): Likewise. (aarch64_emit_prologue_components): Likewise. (aarch64_emit_epilogue_components): Likewise. (aarch64_set_handled_components): Likewise. (aarch64_process_components): Likewise. (TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS, TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB, TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS, TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS, TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS, TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Define. From-SVN: r243200 --- gcc/ChangeLog | 28 ++++ gcc/config/aarch64/aarch64.c | 296 ++++++++++++++++++++++++++++++++++++++---- gcc/config/aarch64/aarch64.h | 2 + gcc/config/aarch64/aarch64.md | 1 + 4 files changed, 303 insertions(+), 24 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 47b3f84..7957649 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,31 @@ +2016-12-02 Kyrylo Tkachov + + * config/aarch64/aarch64.h (machine_function): Add + reg_is_wrapped_separately field. + * config/aarch64/aarch64.md (LAST_SAVED_REGNUM): Define new constant. + * config/aarch64/aarch64.c (emit_set_insn): Change return type to + rtx_insn *. + (aarch64_save_callee_saves): Don't save registers that are wrapped + separately. + (aarch64_restore_callee_saves): Don't restore registers that are + wrapped separately. + (offset_9bit_signed_unscaled_p, offset_12bit_unsigned_scaled_p, + aarch64_offset_7bit_signed_scaled_p): Move earlier in the file. + (aarch64_get_separate_components): New function. + (aarch64_get_next_set_bit): Likewise. + (aarch64_components_for_bb): Likewise. + (aarch64_disqualify_components): Likewise. + (aarch64_emit_prologue_components): Likewise. + (aarch64_emit_epilogue_components): Likewise. + (aarch64_set_handled_components): Likewise. + (aarch64_process_components): Likewise. + (TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS, + TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB, + TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS, + TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS, + TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS, + TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Define. + 2016-12-02 Martin Jambor * passes.def: Move pass_rebuild_cgraph_edges to the end of diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 68a3380..af3aa0b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1137,7 +1137,7 @@ aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm, /* Emit an insn that's a simple single-set. Both the operands must be known to be valid. */ -inline static rtx +inline static rtx_insn * emit_set_insn (rtx x, rtx y) { return emit_insn (gen_rtx_SET (x, y)); @@ -3134,6 +3134,9 @@ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset, || regno == cfun->machine->frame.wb_candidate2)) continue; + if (cfun->machine->reg_is_wrapped_separately[regno]) + continue; + reg = gen_rtx_REG (mode, regno); offset = start_offset + cfun->machine->frame.reg_offset[regno]; mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx, @@ -3142,6 +3145,7 @@ aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset, regno2 = aarch64_next_callee_save (regno + 1, limit); if (regno2 <= limit + && !cfun->machine->reg_is_wrapped_separately[regno2] && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD) == cfun->machine->frame.reg_offset[regno2])) @@ -3190,6 +3194,9 @@ aarch64_restore_callee_saves (machine_mode mode, regno <= limit; regno = aarch64_next_callee_save (regno + 1, limit)) { + if (cfun->machine->reg_is_wrapped_separately[regno]) + continue; + rtx reg, mem; if (skip_wb @@ -3204,6 +3211,7 @@ aarch64_restore_callee_saves (machine_mode mode, regno2 = aarch64_next_callee_save (regno + 1, limit); if (regno2 <= limit + && !cfun->machine->reg_is_wrapped_separately[regno2] && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD) == cfun->machine->frame.reg_offset[regno2])) { @@ -3223,6 +3231,245 @@ aarch64_restore_callee_saves (machine_mode mode, } } +static inline bool +offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED, + HOST_WIDE_INT offset) +{ + return offset >= -256 && offset < 256; +} + +static inline bool +offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset) +{ + return (offset >= 0 + && offset < 4096 * GET_MODE_SIZE (mode) + && offset % GET_MODE_SIZE (mode) == 0); +} + +bool +aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset) +{ + return (offset >= -64 * GET_MODE_SIZE (mode) + && offset < 64 * GET_MODE_SIZE (mode) + && offset % GET_MODE_SIZE (mode) == 0); +} + +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */ + +static sbitmap +aarch64_get_separate_components (void) +{ + aarch64_layout_frame (); + + sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); + bitmap_clear (components); + + /* The registers we need saved to the frame. */ + for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) + if (aarch64_register_saved_on_entry (regno)) + { + HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno]; + if (!frame_pointer_needed) + offset += cfun->machine->frame.frame_size + - cfun->machine->frame.hard_fp_offset; + /* Check that we can access the stack slot of the register with one + direct load with no adjustments needed. */ + if (offset_12bit_unsigned_scaled_p (DImode, offset)) + bitmap_set_bit (components, regno); + } + + /* Don't mess with the hard frame pointer. */ + if (frame_pointer_needed) + bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM); + + unsigned reg1 = cfun->machine->frame.wb_candidate1; + unsigned reg2 = cfun->machine->frame.wb_candidate2; + /* If aarch64_layout_frame has chosen registers to store/restore with + writeback don't interfere with them to avoid having to output explicit + stack adjustment instructions. */ + if (reg2 != INVALID_REGNUM) + bitmap_clear_bit (components, reg2); + if (reg1 != INVALID_REGNUM) + bitmap_clear_bit (components, reg1); + + bitmap_clear_bit (components, LR_REGNUM); + bitmap_clear_bit (components, SP_REGNUM); + + return components; +} + +/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */ + +static sbitmap +aarch64_components_for_bb (basic_block bb) +{ + bitmap in = DF_LIVE_IN (bb); + bitmap gen = &DF_LIVE_BB_INFO (bb)->gen; + bitmap kill = &DF_LIVE_BB_INFO (bb)->kill; + + sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); + bitmap_clear (components); + + /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */ + for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) + if ((!call_used_regs[regno]) + && (bitmap_bit_p (in, regno) + || bitmap_bit_p (gen, regno) + || bitmap_bit_p (kill, regno))) + bitmap_set_bit (components, regno); + + return components; +} + +/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. + Nothing to do for aarch64. */ + +static void +aarch64_disqualify_components (sbitmap, edge, sbitmap, bool) +{ +} + +/* Return the next set bit in BMP from START onwards. Return the total number + of bits in BMP if no set bit is found at or after START. */ + +static unsigned int +aarch64_get_next_set_bit (sbitmap bmp, unsigned int start) +{ + unsigned int nbits = SBITMAP_SIZE (bmp); + if (start == nbits) + return start; + + gcc_assert (start < nbits); + for (unsigned int i = start; i < nbits; i++) + if (bitmap_bit_p (bmp, i)) + return i; + + return nbits; +} + +/* Do the work for aarch64_emit_prologue_components and + aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers + to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence + for these components or the epilogue sequence. That is, it determines + whether we should emit stores or loads and what kind of CFA notes to attach + to the insns. Otherwise the logic for the two sequences is very + similar. */ + +static void +aarch64_process_components (sbitmap components, bool prologue_p) +{ + rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed + ? HARD_FRAME_POINTER_REGNUM + : STACK_POINTER_REGNUM); + + unsigned last_regno = SBITMAP_SIZE (components); + unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM); + rtx_insn *insn = NULL; + + while (regno != last_regno) + { + /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved + so DFmode for the vector registers is enough. */ + machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode; + rtx reg = gen_rtx_REG (mode, regno); + HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno]; + if (!frame_pointer_needed) + offset += cfun->machine->frame.frame_size + - cfun->machine->frame.hard_fp_offset; + rtx addr = plus_constant (Pmode, ptr_reg, offset); + rtx mem = gen_frame_mem (mode, addr); + + rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem); + unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1); + /* No more registers to handle after REGNO. + Emit a single save/restore and exit. */ + if (regno2 == last_regno) + { + insn = emit_insn (set); + RTX_FRAME_RELATED_P (insn) = 1; + if (prologue_p) + add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); + else + add_reg_note (insn, REG_CFA_RESTORE, reg); + break; + } + + HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2]; + /* The next register is not of the same class or its offset is not + mergeable with the current one into a pair. */ + if (!satisfies_constraint_Ump (mem) + || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2) + || (offset2 - cfun->machine->frame.reg_offset[regno]) + != GET_MODE_SIZE (mode)) + { + insn = emit_insn (set); + RTX_FRAME_RELATED_P (insn) = 1; + if (prologue_p) + add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); + else + add_reg_note (insn, REG_CFA_RESTORE, reg); + + regno = regno2; + continue; + } + + /* REGNO2 can be saved/restored in a pair with REGNO. */ + rtx reg2 = gen_rtx_REG (mode, regno2); + if (!frame_pointer_needed) + offset2 += cfun->machine->frame.frame_size + - cfun->machine->frame.hard_fp_offset; + rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); + rtx mem2 = gen_frame_mem (mode, addr2); + rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2) + : gen_rtx_SET (reg2, mem2); + + if (prologue_p) + insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2)); + else + insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2)); + + RTX_FRAME_RELATED_P (insn) = 1; + if (prologue_p) + { + add_reg_note (insn, REG_CFA_OFFSET, set); + add_reg_note (insn, REG_CFA_OFFSET, set2); + } + else + { + add_reg_note (insn, REG_CFA_RESTORE, reg); + add_reg_note (insn, REG_CFA_RESTORE, reg2); + } + + regno = aarch64_get_next_set_bit (components, regno2 + 1); + } +} + +/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */ + +static void +aarch64_emit_prologue_components (sbitmap components) +{ + aarch64_process_components (components, true); +} + +/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */ + +static void +aarch64_emit_epilogue_components (sbitmap components) +{ + aarch64_process_components (components, false); +} + +/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */ + +static void +aarch64_set_handled_components (sbitmap components) +{ + for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) + if (bitmap_bit_p (components, regno)) + cfun->machine->reg_is_wrapped_separately[regno] = true; +} + /* AArch64 stack frames generated by this compiler look like: +-------------------------------+ @@ -3981,29 +4228,6 @@ aarch64_classify_index (struct aarch64_address_info *info, rtx x, return false; } -bool -aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset) -{ - return (offset >= -64 * GET_MODE_SIZE (mode) - && offset < 64 * GET_MODE_SIZE (mode) - && offset % GET_MODE_SIZE (mode) == 0); -} - -static inline bool -offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED, - HOST_WIDE_INT offset) -{ - return offset >= -256 && offset < 256; -} - -static inline bool -offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset) -{ - return (offset >= 0 - && offset < 4096 * GET_MODE_SIZE (mode) - && offset % GET_MODE_SIZE (mode) == 0); -} - /* Return true if MODE is one of the modes for which we support LDP/STP operations. */ @@ -14567,6 +14791,30 @@ aarch64_libgcc_floating_mode_supported_p #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \ aarch64_first_cycle_multipass_dfa_lookahead_guard +#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS +#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \ + aarch64_get_separate_components + +#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB +#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \ + aarch64_components_for_bb + +#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS +#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \ + aarch64_disqualify_components + +#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS +#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \ + aarch64_emit_prologue_components + +#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS +#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \ + aarch64_emit_epilogue_components + +#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS +#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \ + aarch64_set_handled_components + #undef TARGET_TRAMPOLINE_INIT #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 584ff5c..c417569 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -591,6 +591,8 @@ struct GTY (()) aarch64_frame typedef struct GTY (()) machine_function { struct aarch64_frame frame; + /* One entry for each hard register. */ + bool reg_is_wrapped_separately[LAST_SAVED_REGNUM]; } machine_function; #endif diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index bc6d8a2..1e6b6f5 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -59,6 +59,7 @@ (V0_REGNUM 32) (V15_REGNUM 47) (V31_REGNUM 63) + (LAST_SAVED_REGNUM 63) (SFP_REGNUM 64) (AP_REGNUM 65) (CC_REGNUM 66) -- cgit v1.1 From 6ed022af2addc38c0d7c2b321d279a48f73f11c4 Mon Sep 17 00:00:00 2001 From: Janus Weil Date: Fri, 2 Dec 2016 19:38:24 +0100 Subject: [multiple changes] 2016-12-02 Janus Weil Steven G. Kargl PR fortran/78618 * check.c (gfc_check_rank): Remove ATTRIBUTE_UNUSED. * expr.c (gfc_check_assign): Fix error propagation. 2016-12-02 Steven G. Kargl PR fortran/78618 * gfortran.dg/char_conversion.f90: New test. From-SVN: r243201 --- gcc/fortran/ChangeLog | 7 +++++++ gcc/fortran/check.c | 2 +- gcc/fortran/expr.c | 6 +++--- gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/gfortran.dg/char_conversion.f90 | 10 ++++++++++ 5 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gfortran.dg/char_conversion.f90 diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index 20a9f2e..b11a999 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,10 @@ +2016-12-02 Janus Weil + Steven G. Kargl + + PR fortran/78618 + * check.c (gfc_check_rank): Remove ATTRIBUTE_UNUSED. + * expr.c (gfc_check_assign): Fix error propagation. + 2016-12-01 Elizebeth Punnoose PR fortran/77505 diff --git a/gcc/fortran/check.c b/gcc/fortran/check.c index 3b80156..3ea8391 100644 --- a/gcc/fortran/check.c +++ b/gcc/fortran/check.c @@ -3667,7 +3667,7 @@ gfc_check_range (gfc_expr *x) bool -gfc_check_rank (gfc_expr *a ATTRIBUTE_UNUSED) +gfc_check_rank (gfc_expr *a) { /* Any data object is allowed; a "data object" is a "constant (4.1.3), variable (6), or subobject of a constant (2.4.3.2.3)" (F2008, 1.3.45). */ diff --git a/gcc/fortran/expr.c b/gcc/fortran/expr.c index 60f6080..c01418d 100644 --- a/gcc/fortran/expr.c +++ b/gcc/fortran/expr.c @@ -3314,9 +3314,9 @@ gfc_check_assign (gfc_expr *lvalue, gfc_expr *rvalue, int conform, if (lvalue->ts.type == BT_CHARACTER && rvalue->ts.type == BT_CHARACTER) { if (lvalue->ts.kind != rvalue->ts.kind && allow_convert) - gfc_convert_chartype (rvalue, &lvalue->ts); - - return true; + return gfc_convert_chartype (rvalue, &lvalue->ts); + else + return true; } if (!allow_convert) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 3a478b9..d7b90d6 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Steven G. Kargl + + PR fortran/78618 + * gfortran.dg/char_conversion.f90: New test. + 2016-12-02 Jakub Jelinek PR target/70322 diff --git a/gcc/testsuite/gfortran.dg/char_conversion.f90 b/gcc/testsuite/gfortran.dg/char_conversion.f90 new file mode 100644 index 0000000..7120a1c --- /dev/null +++ b/gcc/testsuite/gfortran.dg/char_conversion.f90 @@ -0,0 +1,10 @@ +! { dg-do compile } +! +! PR 78618: ICE in gfc_check_rank, at fortran/check.c:3670 +! +! Contributed by Gerhard Steinmetz + +program p + character, parameter :: c = char(256,4) ! { dg-error "cannot be converted" } + if (rank(c) /= 0) call abort +end -- cgit v1.1 From 81cff75ff9178e9344e1f4c8935ffd451f66632f Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 2 Dec 2016 19:48:35 +0100 Subject: re PR target/70322 (STV doesn't optimize andn) PR target/70322 * config/i386/i386.md (*andndi3_doubleword): Add non-BMI alternative and corresponding post-reload splitter. testsuite/ChangeLog: PR target/70322 * gcc.target/i386/pr70322-2.c (dg-final): Remove xfail. From-SVN: r243202 --- gcc/ChangeLog | 6 ++++ gcc/config/i386/i386.md | 51 +++++++++++++++++++++++-------- gcc/testsuite/ChangeLog | 5 +++ gcc/testsuite/gcc.target/i386/pr70322-2.c | 2 +- 4 files changed, 51 insertions(+), 13 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7957649..ee603c4 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2016-12-02 Uros Bizjak + + PR target/70322 + * config/i386/i386.md (*andndi3_doubleword): Add non-BMI alternative + and corresponding post-reload splitter. + 2016-12-02 Kyrylo Tkachov * config/aarch64/aarch64.h (machine_function): Add diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index da7cb07..773f29b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -8534,15 +8534,24 @@ operands[2] = gen_lowpart (QImode, operands[2]); }) -(define_insn_and_split "*andndi3_doubleword" - [(set (match_operand:DI 0 "register_operand" "=r") +(define_insn "*andndi3_doubleword" + [(set (match_operand:DI 0 "register_operand" "=r,&r") (and:DI - (not:DI (match_operand:DI 1 "register_operand" "r")) - (match_operand:DI 2 "nonimmediate_operand" "rm"))) + (not:DI (match_operand:DI 1 "register_operand" "r,0")) + (match_operand:DI 2 "nonimmediate_operand" "rm,rm"))) (clobber (reg:CC FLAGS_REG))] - "TARGET_BMI && !TARGET_64BIT && TARGET_STV && TARGET_SSE2" + "!TARGET_64BIT && TARGET_STV && TARGET_SSE2" "#" - "&& reload_completed" + [(set_attr "isa" "bmi,*")]) + +(define_split + [(set (match_operand:DI 0 "register_operand") + (and:DI + (not:DI (match_operand:DI 1 "register_operand")) + (match_operand:DI 2 "nonimmediate_operand"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT && TARGET_BMI && TARGET_STV && TARGET_SSE2 + && reload_completed" [(parallel [(set (match_dup 0) (and:SI (not:SI (match_dup 1)) (match_dup 2))) (clobber (reg:CC FLAGS_REG))]) @@ -8551,6 +8560,24 @@ (clobber (reg:CC FLAGS_REG))])] "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);") +(define_split + [(set (match_operand:DI 0 "register_operand") + (and:DI + (not:DI (match_dup 0)) + (match_operand:DI 1 "nonimmediate_operand"))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_64BIT && !TARGET_BMI && TARGET_STV && TARGET_SSE2 + && reload_completed" + [(set (match_dup 0) (not:SI (match_dup 0))) + (parallel [(set (match_dup 0) + (and:SI (match_dup 0) (match_dup 1))) + (clobber (reg:CC FLAGS_REG))]) + (set (match_dup 2) (not:SI (match_dup 2))) + (parallel [(set (match_dup 2) + (and:SI (match_dup 2) (match_dup 3))) + (clobber (reg:CC FLAGS_REG))])] + "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);") + (define_insn "*andn_1" [(set (match_operand:SWI48 0 "register_operand" "=r,r") (and:SWI48 @@ -9312,6 +9339,12 @@ ;; One complement instructions +(define_expand "one_cmpl2" + [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") + (not:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")))] + "" + "ix86_expand_unary_operator (NOT, mode, operands); DONE;") + (define_insn_and_split "*one_cmpldi2_doubleword" [(set (match_operand:DI 0 "nonimmediate_operand" "=rm") (not:DI (match_operand:DI 1 "nonimmediate_operand" "0")))] @@ -9325,12 +9358,6 @@ (not:SI (match_dup 3)))] "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);") -(define_expand "one_cmpl2" - [(set (match_operand:SWIM1248x 0 "nonimmediate_operand") - (not:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")))] - "" - "ix86_expand_unary_operator (NOT, mode, operands); DONE;") - (define_insn "*one_cmpl2_1" [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm") (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0")))] diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index d7b90d6..261731c 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Uros Bizjak + + PR target/70322 + * gcc.target/i386/pr70322-2.c (dg-final): Remove xfail. + 2016-12-02 Steven G. Kargl PR fortran/78618 diff --git a/gcc/testsuite/gcc.target/i386/pr70322-2.c b/gcc/testsuite/gcc.target/i386/pr70322-2.c index 7c5d0be..a683b6d 100644 --- a/gcc/testsuite/gcc.target/i386/pr70322-2.c +++ b/gcc/testsuite/gcc.target/i386/pr70322-2.c @@ -1,7 +1,7 @@ /* PR target/70322 */ /* { dg-do compile { target ia32 } } */ /* { dg-options "-O2 -msse2 -mstv -mno-bmi" } */ -/* { dg-final { scan-assembler "pandn" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler "pandn" } } */ extern long long z; -- cgit v1.1 From a3f7c896a55c33e369f9bbb210f4b43a345beabe Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Fri, 2 Dec 2016 22:23:22 +0100 Subject: =?UTF-8?q?re=20PR=20c++/78649=20(ICE=20on=20invalid=20C++=20code?= =?UTF-8?q?=20on=20x86=5F64-linux-gnu=20(internal=20compiler=20error:=20tr?= =?UTF-8?q?ee=20check:=20expected=20class=20=E2=80=98type=E2=80=99,=20have?= =?UTF-8?q?=20=E2=80=98exceptional=E2=80=99=20(error=5Fmark)=20in=20build?= =?UTF-8?q?=5Fvalue=5Finit=5Fnoctor,=20at=20cp/init.c:380))?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR c++/78649 * pt.c (tsubst_init): Don't call build_value_init if decl's type is error_mark_node. * g++.dg/cpp0x/pr78649.C: New test. From-SVN: r243204 --- gcc/cp/ChangeLog | 6 ++++++ gcc/cp/pt.c | 2 +- gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/g++.dg/cpp0x/pr78649.C | 16 ++++++++++++++++ 4 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/g++.dg/cpp0x/pr78649.C diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index d39c222..f16813d 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -1,3 +1,9 @@ +2016-12-02 Jakub Jelinek + + PR c++/78649 + * pt.c (tsubst_init): Don't call build_value_init if decl's type + is error_mark_node. + 2016-12-02 Cesar Philippidis James Norris diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c index 8b0a21c..b51e580 100644 --- a/gcc/cp/pt.c +++ b/gcc/cp/pt.c @@ -14082,7 +14082,7 @@ tsubst_init (tree init, tree decl, tree args, init = tsubst_expr (init, args, complain, in_decl, false); - if (!init) + if (!init && TREE_TYPE (decl) != error_mark_node) { /* If we had an initializer but it instantiated to nothing, diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 261731c..103906b 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Jakub Jelinek + + PR c++/78649 + * g++.dg/cpp0x/pr78649.C: New test. + 2016-12-02 Uros Bizjak PR target/70322 diff --git a/gcc/testsuite/g++.dg/cpp0x/pr78649.C b/gcc/testsuite/g++.dg/cpp0x/pr78649.C new file mode 100644 index 0000000..43bcb64 --- /dev/null +++ b/gcc/testsuite/g++.dg/cpp0x/pr78649.C @@ -0,0 +1,16 @@ +// PR c++/78649 +// { dg-do compile { target c++11 } } + +template void foo (); +template +void +test () +{ + T t (foo...); // { dg-error "declared void" } +} + +int +main () +{ + test (); +} -- cgit v1.1 From 471092175dcb1ee2ecae398f897003310f1c2e24 Mon Sep 17 00:00:00 2001 From: "Steven G. Kargl" Date: Fri, 2 Dec 2016 22:09:13 +0000 Subject: simplify.c (gfc_convert_char_constant): Free result on error. 2016-12-02 Steven G. Kargl * simplify.c (gfc_convert_char_constant): Free result on error. From-SVN: r243205 --- gcc/fortran/ChangeLog | 4 ++++ gcc/fortran/simplify.c | 1 + 2 files changed, 5 insertions(+) diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index b11a999..68d0559 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,7 @@ +2016-12-02 Steven G. Kargl + + * simplify.c (gfc_convert_char_constant): Free result on error. + 2016-12-02 Janus Weil Steven G. Kargl diff --git a/gcc/fortran/simplify.c b/gcc/fortran/simplify.c index 9047c63..a46fbc5 100644 --- a/gcc/fortran/simplify.c +++ b/gcc/fortran/simplify.c @@ -7152,6 +7152,7 @@ gfc_convert_char_constant (gfc_expr *e, bt type ATTRIBUTE_UNUSED, int kind) "into character kind %d", gfc_print_wide_char (result->value.character.string[i]), &e->where, kind); + gfc_free_expr (result); return &gfc_bad_expr; } -- cgit v1.1 From 6556f6516673c0c51167238d5f594d09cb2d435f Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Fri, 2 Dec 2016 22:12:08 +0000 Subject: re PR target/78639 (Power9 bad code generation for cactusADM benchmark) 2016-12-02 Michael Meissner PR target/78639 * config/rs6000/rs6000.md (movdi_internal64): Fix typo in subversion id 242679 that causes the wrong store instruction to be generated if a DImode is in an Altivec register using REG+REG addressing. From-SVN: r243206 --- gcc/ChangeLog | 8 ++++++++ gcc/config/rs6000/rs6000.md | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index ee603c4..19394b0 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2016-12-02 Michael Meissner + + PR target/78639 + * config/rs6000/rs6000.md (movdi_internal64): Fix typo in + subversion id 242679 that causes the wrong store instruction to be + generated if a DImode is in an Altivec register using REG+REG + addressing. + 2016-12-02 Uros Bizjak PR target/70322 diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index f1ba8d4..5a453a0 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -8239,7 +8239,7 @@ (define_insn "*movdi_internal64" [(set (match_operand:DI 0 "nonimmediate_operand" "=Y, r, r, r, r, r, - ^m, ^d, ^d, ^Y, $Z, $wb, + ^m, ^d, ^d, ^wY, $Z, $wb, $wv, ^wi, *wo, *wo, *wv, *wi, *wi, *wv, *wv, r, *h, *h, ?*r, ?*wg, ?*r, ?*wj") -- cgit v1.1 From bf7f70ffb57c900d18fed28ad05fc2876d435b3d Mon Sep 17 00:00:00 2001 From: David Malcolm Date: Fri, 2 Dec 2016 22:39:43 +0000 Subject: selftest.c: remove calls to strndup (PR bootstrap/78616) gcc/ChangeLog: PR bootstrap/78616 * selftest.c (selftest::assert_strndup_eq): Rename to... (selftest::assert_xstrndup_eq): ...this, and remove call to strndup. (selftest::test_strndup): Rename to... (selftest::test_xstrndup): ...this, updating for above renaming. (selftest::test_libiberty): Update for renaming. From-SVN: r243207 --- gcc/ChangeLog | 10 ++++++++++ gcc/selftest.c | 40 +++++++++++++++++----------------------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 19394b0..18efa04 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2016-12-02 David Malcolm + + PR bootstrap/78616 + * selftest.c (selftest::assert_strndup_eq): Rename to... + (selftest::assert_xstrndup_eq): ...this, and remove call to + strndup. + (selftest::test_strndup): Rename to... + (selftest::test_xstrndup): ...this, updating for above renaming. + (selftest::test_libiberty): Update for renaming. + 2016-12-02 Michael Meissner PR target/78639 diff --git a/gcc/selftest.c b/gcc/selftest.c index 6df73c2..40c6cb5 100644 --- a/gcc/selftest.c +++ b/gcc/selftest.c @@ -200,41 +200,35 @@ read_file (const location &loc, const char *path) /* Selftests for libiberty. */ -/* Verify that both strndup and xstrndup generate EXPECTED - when called on SRC and N. */ +/* Verify that xstrndup generates EXPECTED when called on SRC and N. */ static void -assert_strndup_eq (const char *expected, const char *src, size_t n) +assert_xstrndup_eq (const char *expected, const char *src, size_t n) { - char *buf = strndup (src, n); - if (buf) - ASSERT_STREQ (expected, buf); - free (buf); - - buf = xstrndup (src, n); + char *buf = xstrndup (src, n); ASSERT_STREQ (expected, buf); free (buf); } -/* Verify that strndup and xstrndup work as expected. */ +/* Verify that xstrndup works as expected. */ static void -test_strndup () +test_xstrndup () { - assert_strndup_eq ("", "test", 0); - assert_strndup_eq ("t", "test", 1); - assert_strndup_eq ("te", "test", 2); - assert_strndup_eq ("tes", "test", 3); - assert_strndup_eq ("test", "test", 4); - assert_strndup_eq ("test", "test", 5); + assert_xstrndup_eq ("", "test", 0); + assert_xstrndup_eq ("t", "test", 1); + assert_xstrndup_eq ("te", "test", 2); + assert_xstrndup_eq ("tes", "test", 3); + assert_xstrndup_eq ("test", "test", 4); + assert_xstrndup_eq ("test", "test", 5); /* Test on an string without zero termination. */ const char src[4] = {'t', 'e', 's', 't'}; - assert_strndup_eq ("", src, 0); - assert_strndup_eq ("t", src, 1); - assert_strndup_eq ("te", src, 2); - assert_strndup_eq ("tes", src, 3); - assert_strndup_eq ("test", src, 4); + assert_xstrndup_eq ("", src, 0); + assert_xstrndup_eq ("t", src, 1); + assert_xstrndup_eq ("te", src, 2); + assert_xstrndup_eq ("tes", src, 3); + assert_xstrndup_eq ("test", src, 4); } /* Run selftests for libiberty. */ @@ -242,7 +236,7 @@ test_strndup () static void test_libiberty () { - test_strndup (); + test_xstrndup (); } /* Selftests for the selftest system itself. */ -- cgit v1.1 From 5bd12bafb5bebafa1a96f0f41c598b57c334576f Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Sat, 3 Dec 2016 00:51:31 +0100 Subject: simplify-rtx: Fix the last fix (PR78638) I managed to get the last obvious fix wrong: mode is M1, GET_MODE (op) is M2. * simplify-rtx.c (simplify_truncation): M2 is not mode, it is GET_MODE (op). Fix this. From-SVN: r243210 --- gcc/ChangeLog | 5 +++++ gcc/simplify-rtx.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 18efa04..0d57fe3 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Segher Boessenkool + + * simplify-rtx.c (simplify_truncation): M2 is not mode, it is + GET_MODE (op). Fix this. + 2016-12-02 David Malcolm PR bootstrap/78616 diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c index 7ed849f..165af23 100644 --- a/gcc/simplify-rtx.c +++ b/gcc/simplify-rtx.c @@ -752,7 +752,7 @@ simplify_truncation (machine_mode mode, rtx op, changing len. */ if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT) && REG_P (XEXP (op, 0)) - && GET_MODE (XEXP (op, 0)) == mode + && GET_MODE (XEXP (op, 0)) == GET_MODE (op) && CONST_INT_P (XEXP (op, 1)) && CONST_INT_P (XEXP (op, 2))) { -- cgit v1.1 From 9dd059b7c37796353782c330f73886f931d2b1bc Mon Sep 17 00:00:00 2001 From: GCC Administrator Date: Sat, 3 Dec 2016 00:16:15 +0000 Subject: Daily bump. From-SVN: r243214 --- gcc/DATESTAMP | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index b720a20..8caabe1 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20161202 +20161203 -- cgit v1.1 From 25207f51c498deae4bf57a7ea87fe429cefee711 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Sat, 3 Dec 2016 00:41:44 +0000 Subject: config.gcc (powerpc*-*-linux*): Set gnu-indirect-function by default on PowerPC linux systems. 2016-12-02 Michael Meissner * config.gcc (powerpc*-*-linux*): Set gnu-indirect-function by default on PowerPC linux systems. From-SVN: r243215 --- gcc/ChangeLog | 5 +++++ gcc/config.gcc | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 0d57fe3..d651cbd 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-02 Michael Meissner + + * config.gcc (powerpc*-*-linux*): Set gnu-indirect-function by + default on PowerPC linux systems. + 2016-12-02 Segher Boessenkool * simplify-rtx.c (simplify_truncation): M2 is not mode, it is diff --git a/gcc/config.gcc b/gcc/config.gcc index 1fa34ac..189073e 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -2443,6 +2443,14 @@ powerpc*-*-linux*) if test x${enable_secureplt} = xyes; then tm_file="rs6000/secureplt.h ${tm_file}" fi + # Assume modern glibc if not targeting Android nor uclibc. + case ${target} in + *-*-*android*|*-*-*uclibc*|*-*-*musl*) + ;; + *) + default_gnu_indirect_function=yes + ;; + esac ;; powerpc-wrs-vxworks|powerpc-wrs-vxworksae|powerpc-wrs-vxworksmils) tm_file="${tm_file} elfos.h freebsd-spec.h rs6000/sysv4.h" -- cgit v1.1 From 57c9def7d08e686b794ec64d9df15cd9ccf8d174 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Fri, 2 Dec 2016 19:02:51 -0700 Subject: arm.c (arm_handle_cmse_nonsecure_call): Remove unused variable main_variant. * config/arm/arm.c (arm_handle_cmse_nonsecure_call): Remove unused variable main_variant. From-SVN: r243216 --- gcc/ChangeLog | 5 +++++ gcc/config/arm/arm.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d651cbd..96ae900 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-01 Jeff Law + + * config/arm/arm.c (arm_handle_cmse_nonsecure_call): Remove unused + variable main_variant. + 2016-12-02 Michael Meissner * config.gcc (powerpc*-*-linux*): Set gnu-indirect-function by diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index f1df3a0..ec1f5fc 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -6774,7 +6774,7 @@ arm_handle_cmse_nonsecure_call (tree *node, tree name, bool *no_add_attrs) { tree decl = NULL_TREE, fntype = NULL_TREE; - tree main_variant, type; + tree type; if (!use_cmse) { -- cgit v1.1 From 98ff2d6b12106fa5c7e7a3249089ace171075fe3 Mon Sep 17 00:00:00 2001 From: "Steven G. Kargl" Date: Sat, 3 Dec 2016 07:23:13 +0000 Subject: expr.c (gfc_build_conversion): Remove unneeded initialization. 2016-12-02 Steven G. Kargl * expr.c (gfc_build_conversion): Remove unneeded initialization. From-SVN: r243217 --- gcc/fortran/expr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/gcc/fortran/expr.c b/gcc/fortran/expr.c index c01418d..3464a20 100644 --- a/gcc/fortran/expr.c +++ b/gcc/fortran/expr.c @@ -795,8 +795,6 @@ gfc_build_conversion (gfc_expr *e) p = gfc_get_expr (); p->expr_type = EXPR_FUNCTION; p->symtree = NULL; - p->value.function.actual = NULL; - p->value.function.actual = gfc_get_actual_arglist (); p->value.function.actual->expr = e; -- cgit v1.1 From 802583a210c22cdbabb63d660633af09f0039a32 Mon Sep 17 00:00:00 2001 From: Janus Weil Date: Sat, 3 Dec 2016 10:32:27 +0100 Subject: re PR fortran/58175 ([OOP] Incorrect warning message on scalar finalizer) 2016-12-03 Janus Weil PR fortran/58175 * resolve.c (gfc_resolve_finalizers): Prevent bogus warning. 2016-12-03 Janus Weil PR fortran/58175 * gfortran.dg/finalize_30.f90: Extend test case. From-SVN: r243218 --- gcc/fortran/ChangeLog | 5 +++++ gcc/fortran/resolve.c | 2 +- gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/gfortran.dg/finalize_30.f90 | 2 ++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index 68d0559..7a007c3 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,8 @@ +2016-12-03 Janus Weil + + PR fortran/58175 + * resolve.c (gfc_resolve_finalizers): Prevent bogus warning. + 2016-12-02 Steven G. Kargl * simplify.c (gfc_convert_char_constant): Free result on error. diff --git a/gcc/fortran/resolve.c b/gcc/fortran/resolve.c index 152678f..7bc9f5f 100644 --- a/gcc/fortran/resolve.c +++ b/gcc/fortran/resolve.c @@ -12517,7 +12517,7 @@ error: /* Warn if we haven't seen a scalar finalizer procedure (but we know there were nodes in the list, must have been for arrays. It is surely a good idea to have a scalar version there if there's something to finalize. */ - if (warn_surprising && result && !seen_scalar) + if (warn_surprising && derived->f2k_derived->finalizers && !seen_scalar) gfc_warning (OPT_Wsurprising, "Only array FINAL procedures declared for derived type %qs" " defined at %L, suggest also scalar one", diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 103906b..39a5c59 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-03 Janus Weil + + PR fortran/58175 + * gfortran.dg/finalize_30.f90: Extend test case. + 2016-12-02 Jakub Jelinek PR c++/78649 diff --git a/gcc/testsuite/gfortran.dg/finalize_30.f90 b/gcc/testsuite/gfortran.dg/finalize_30.f90 index 281bfaa..b93a3d5 100644 --- a/gcc/testsuite/gfortran.dg/finalize_30.f90 +++ b/gcc/testsuite/gfortran.dg/finalize_30.f90 @@ -10,6 +10,8 @@ module ct contains final :: aD end type + type, extends(a) :: a1 + end type contains subroutine aD(self) type(a), intent(inout) :: self -- cgit v1.1 From 31cfd832864298ea34c52625312e2a0ed0478e3d Mon Sep 17 00:00:00 2001 From: Thomas Koenig Date: Sat, 3 Dec 2016 09:44:35 +0000 Subject: re PR libfortran/78379 (Processor-specific versions for matmul) 2016-12-03 Thomas Koenig PR fortran/78379 * config/i386/cpuinfo.c: Move denums for processor vendors, processor type, processor subtypes and declaration of struct __processor_model into * config/i386/cpuinfo.h: New header file. * Makefile.am: Add dependence of m4/matmul_internal_m4 to mamtul files.. * Makefile.in: Regenerated. * acinclude.m4: Check for AVX, AVX2 and AVX512F. * config.h.in: Add HAVE_AVX, HAVE_AVX2 and HAVE_AVX512F. * configure: Regenerated. * configure.ac: Use checks for AVX, AVX2 and AVX_512F. * m4/matmul_internal.m4: New file. working part of matmul.m4. * m4/matmul.m4: Implement architecture-specific switching for AVX, AVX2 and AVX512F by including matmul_internal.m4 multiple times. * generated/matmul_c10.c: Regenerated. * generated/matmul_c16.c: Regenerated. * generated/matmul_c4.c: Regenerated. * generated/matmul_c8.c: Regenerated. * generated/matmul_i1.c: Regenerated. * generated/matmul_i16.c: Regenerated. * generated/matmul_i2.c: Regenerated. * generated/matmul_i4.c: Regenerated. * generated/matmul_i8.c: Regenerated. * generated/matmul_r10.c: Regenerated. * generated/matmul_r16.c: Regenerated. * generated/matmul_r4.c: Regenerated. * generated/matmul_r8.c: Regenerated. From-SVN: r243219 --- libgcc/ChangeLog | 8 + libgcc/config/i386/cpuinfo.c | 92 +- libgcc/config/i386/cpuinfo.h | 116 ++ libgfortran/ChangeLog | 28 + libgfortran/Makefile.am | 2 +- libgfortran/Makefile.in | 2 +- libgfortran/acinclude.m4 | 51 + libgfortran/config.h.in | 9 + libgfortran/configure | 87 ++ libgfortran/configure.ac | 9 + libgfortran/generated/matmul_c10.c | 2233 +++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_c16.c | 2233 +++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_c4.c | 2233 +++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_c8.c | 2233 +++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_i1.c | 2233 +++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_i16.c | 2233 +++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_i2.c | 2233 +++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_i4.c | 2233 +++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_i8.c | 2233 +++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_r10.c | 2237 ++++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_r16.c | 2237 ++++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_r4.c | 2237 ++++++++++++++++++++++++++++++++++++ libgfortran/generated/matmul_r8.c | 2237 ++++++++++++++++++++++++++++++++++++ libgfortran/m4/matmul.m4 | 596 ++-------- libgfortran/m4/matmul_internal.m4 | 537 +++++++++ 25 files changed, 29976 insertions(+), 606 deletions(-) create mode 100644 libgcc/config/i386/cpuinfo.h create mode 100644 libgfortran/m4/matmul_internal.m4 diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog index 6627d26..efadedf 100644 --- a/libgcc/ChangeLog +++ b/libgcc/ChangeLog @@ -1,3 +1,11 @@ +2016-12-03 Thomas Koenig + + PR fortran/78379 + * config/i386/cpuinfo.c: Move denums for processor vendors, + processor type, processor subtypes and declaration of + struct __processor_model into + * config/i386/cpuinfo.h: New header file. + 2016-12-02 Andre Vieira Thomas Preud'homme diff --git a/libgcc/config/i386/cpuinfo.c b/libgcc/config/i386/cpuinfo.c index 4a0ad25..9f30cb8 100644 --- a/libgcc/config/i386/cpuinfo.c +++ b/libgcc/config/i386/cpuinfo.c @@ -26,6 +26,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #include "cpuid.h" #include "tsystem.h" #include "auto-target.h" +#include "cpuinfo.h" #ifdef HAVE_INIT_PRIORITY #define CONSTRUCTOR_PRIORITY (101) @@ -36,97 +37,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see int __cpu_indicator_init (void) __attribute__ ((constructor CONSTRUCTOR_PRIORITY)); -/* Processor Vendor and Models. */ -enum processor_vendor -{ - VENDOR_INTEL = 1, - VENDOR_AMD, - VENDOR_OTHER, - VENDOR_MAX -}; - -/* Any new types or subtypes have to be inserted at the end. */ - -enum processor_types -{ - INTEL_BONNELL = 1, - INTEL_CORE2, - INTEL_COREI7, - AMDFAM10H, - AMDFAM15H, - INTEL_SILVERMONT, - INTEL_KNL, - AMD_BTVER1, - AMD_BTVER2, - AMDFAM17H, - CPU_TYPE_MAX -}; - -enum processor_subtypes -{ - INTEL_COREI7_NEHALEM = 1, - INTEL_COREI7_WESTMERE, - INTEL_COREI7_SANDYBRIDGE, - AMDFAM10H_BARCELONA, - AMDFAM10H_SHANGHAI, - AMDFAM10H_ISTANBUL, - AMDFAM15H_BDVER1, - AMDFAM15H_BDVER2, - AMDFAM15H_BDVER3, - AMDFAM15H_BDVER4, - AMDFAM17H_ZNVER1, - INTEL_COREI7_IVYBRIDGE, - INTEL_COREI7_HASWELL, - INTEL_COREI7_BROADWELL, - INTEL_COREI7_SKYLAKE, - INTEL_COREI7_SKYLAKE_AVX512, - CPU_SUBTYPE_MAX -}; - -/* ISA Features supported. New features have to be inserted at the end. */ - -enum processor_features -{ - FEATURE_CMOV = 0, - FEATURE_MMX, - FEATURE_POPCNT, - FEATURE_SSE, - FEATURE_SSE2, - FEATURE_SSE3, - FEATURE_SSSE3, - FEATURE_SSE4_1, - FEATURE_SSE4_2, - FEATURE_AVX, - FEATURE_AVX2, - FEATURE_SSE4_A, - FEATURE_FMA4, - FEATURE_XOP, - FEATURE_FMA, - FEATURE_AVX512F, - FEATURE_BMI, - FEATURE_BMI2, - FEATURE_AES, - FEATURE_PCLMUL, - FEATURE_AVX512VL, - FEATURE_AVX512BW, - FEATURE_AVX512DQ, - FEATURE_AVX512CD, - FEATURE_AVX512ER, - FEATURE_AVX512PF, - FEATURE_AVX512VBMI, - FEATURE_AVX512IFMA, - FEATURE_AVX5124VNNIW, - FEATURE_AVX5124FMAPS -}; - -struct __processor_model -{ - unsigned int __cpu_vendor; - unsigned int __cpu_type; - unsigned int __cpu_subtype; - unsigned int __cpu_features[1]; -} __cpu_model = { }; +struct __processor_model __cpu_model = { }; /* Get the specific type of AMD CPU. */ diff --git a/libgcc/config/i386/cpuinfo.h b/libgcc/config/i386/cpuinfo.h new file mode 100644 index 0000000..cf848e6 --- /dev/null +++ b/libgcc/config/i386/cpuinfo.h @@ -0,0 +1,116 @@ +/* Get CPU type and Features for x86 processors. + Copyright (C) 2012-2016 Free Software Foundation, Inc. + Contributed by Sriraman Tallam (tmsriram@google.com) + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. */ + +/* Processor Vendor and Models. */ + +enum processor_vendor +{ + VENDOR_INTEL = 1, + VENDOR_AMD, + VENDOR_OTHER, + VENDOR_MAX +}; + +/* Any new types or subtypes have to be inserted at the end. */ + +enum processor_types +{ + INTEL_BONNELL = 1, + INTEL_CORE2, + INTEL_COREI7, + AMDFAM10H, + AMDFAM15H, + INTEL_SILVERMONT, + INTEL_KNL, + AMD_BTVER1, + AMD_BTVER2, + AMDFAM17H, + CPU_TYPE_MAX +}; + +enum processor_subtypes +{ + INTEL_COREI7_NEHALEM = 1, + INTEL_COREI7_WESTMERE, + INTEL_COREI7_SANDYBRIDGE, + AMDFAM10H_BARCELONA, + AMDFAM10H_SHANGHAI, + AMDFAM10H_ISTANBUL, + AMDFAM15H_BDVER1, + AMDFAM15H_BDVER2, + AMDFAM15H_BDVER3, + AMDFAM15H_BDVER4, + AMDFAM17H_ZNVER1, + INTEL_COREI7_IVYBRIDGE, + INTEL_COREI7_HASWELL, + INTEL_COREI7_BROADWELL, + INTEL_COREI7_SKYLAKE, + INTEL_COREI7_SKYLAKE_AVX512, + CPU_SUBTYPE_MAX +}; + +/* ISA Features supported. New features have to be inserted at the end. */ + +enum processor_features +{ + FEATURE_CMOV = 0, + FEATURE_MMX, + FEATURE_POPCNT, + FEATURE_SSE, + FEATURE_SSE2, + FEATURE_SSE3, + FEATURE_SSSE3, + FEATURE_SSE4_1, + FEATURE_SSE4_2, + FEATURE_AVX, + FEATURE_AVX2, + FEATURE_SSE4_A, + FEATURE_FMA4, + FEATURE_XOP, + FEATURE_FMA, + FEATURE_AVX512F, + FEATURE_BMI, + FEATURE_BMI2, + FEATURE_AES, + FEATURE_PCLMUL, + FEATURE_AVX512VL, + FEATURE_AVX512BW, + FEATURE_AVX512DQ, + FEATURE_AVX512CD, + FEATURE_AVX512ER, + FEATURE_AVX512PF, + FEATURE_AVX512VBMI, + FEATURE_AVX512IFMA, + FEATURE_AVX5124VNNIW, + FEATURE_AVX5124FMAPS +}; + +extern struct __processor_model +{ + unsigned int __cpu_vendor; + unsigned int __cpu_type; + unsigned int __cpu_subtype; + unsigned int __cpu_features[1]; +} __cpu_model; diff --git a/libgfortran/ChangeLog b/libgfortran/ChangeLog index d3966f5..03ff063 100644 --- a/libgfortran/ChangeLog +++ b/libgfortran/ChangeLog @@ -1,3 +1,31 @@ +2016-12-03 Thomas Koenig + + PR fortran/78379 + * Makefile.am: Add dependence of m4/matmul_internal_m4 to + mamtul files.. + * Makefile.in: Regenerated. + * acinclude.m4: Check for AVX, AVX2 and AVX512F. + * config.h.in: Add HAVE_AVX, HAVE_AVX2 and HAVE_AVX512F. + * configure: Regenerated. + * configure.ac: Use checks for AVX, AVX2 and AVX_512F. + * m4/matmul_internal.m4: New file. working part of matmul.m4. + * m4/matmul.m4: Implement architecture-specific switching + for AVX, AVX2 and AVX512F by including matmul_internal.m4 + multiple times. + * generated/matmul_c10.c: Regenerated. + * generated/matmul_c16.c: Regenerated. + * generated/matmul_c4.c: Regenerated. + * generated/matmul_c8.c: Regenerated. + * generated/matmul_i1.c: Regenerated. + * generated/matmul_i16.c: Regenerated. + * generated/matmul_i2.c: Regenerated. + * generated/matmul_i4.c: Regenerated. + * generated/matmul_i8.c: Regenerated. + * generated/matmul_r10.c: Regenerated. + * generated/matmul_r16.c: Regenerated. + * generated/matmul_r4.c: Regenerated. + * generated/matmul_r8.c: Regenerated. + 2016-11-30 Andre Vehreschild * caf/single.c (_gfortran_caf_get_by_ref): Prevent compile time diff --git a/libgfortran/Makefile.am b/libgfortran/Makefile.am index 3db52b8..6137d88 100644 --- a/libgfortran/Makefile.am +++ b/libgfortran/Makefile.am @@ -987,7 +987,7 @@ $(i_product_c): m4/product.m4 $(I_M4_DEPS1) $(i_sum_c): m4/sum.m4 $(I_M4_DEPS1) $(M4) -Dfile=$@ -I$(srcdir)/m4 sum.m4 > $@ -$(i_matmul_c): m4/matmul.m4 $(I_M4_DEPS) +$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS) $(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@ $(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS) diff --git a/libgfortran/Makefile.in b/libgfortran/Makefile.in index f7b34b9..4d95723 100644 --- a/libgfortran/Makefile.in +++ b/libgfortran/Makefile.in @@ -6053,7 +6053,7 @@ fpu-target.inc: fpu-target.h $(srcdir)/libgfortran.h @MAINTAINER_MODE_TRUE@$(i_sum_c): m4/sum.m4 $(I_M4_DEPS1) @MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 sum.m4 > $@ -@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 $(I_M4_DEPS) +@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS) @MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@ @MAINTAINER_MODE_TRUE@$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS) diff --git a/libgfortran/acinclude.m4 b/libgfortran/acinclude.m4 index 7280bc3..9a7f461 100644 --- a/libgfortran/acinclude.m4 +++ b/libgfortran/acinclude.m4 @@ -393,3 +393,54 @@ AC_DEFUN([LIBGFOR_CHECK_STRERROR_R], [ [Define if strerror_r takes two arguments and is available in .]),) CFLAGS="$ac_save_CFLAGS" ]) + +dnl Check for AVX + +AC_DEFUN([LIBGFOR_CHECK_AVX], [ + ac_save_CFLAGS="$CFLAGS" + CFLAGS="-O2 -mavx" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + void _mm256_zeroall (void) + { + __builtin_ia32_vzeroall (); + }]], [[]])], + AC_DEFINE(HAVE_AVX, 1, + [Define if AVX instructions can be compiled.]), + []) + CFLAGS="$ac_save_CFLAGS" +]) + +dnl Check for AVX2 + +AC_DEFUN([LIBGFOR_CHECK_AVX2], [ + ac_save_CFLAGS="$CFLAGS" + CFLAGS="-O2 -mavx2" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + typedef long long __v4di __attribute__ ((__vector_size__ (32))); + __v4di + mm256_is32_andnotsi256 (__v4di __X, __v4di __Y) + { + return __builtin_ia32_andnotsi256 (__X, __Y); + }]], [[]])], + AC_DEFINE(HAVE_AVX2, 1, + [Define if AVX2 instructions can be compiled.]), + []) + CFLAGS="$ac_save_CFLAGS" +]) + +dnl Check for AVX512f + +AC_DEFUN([LIBGFOR_CHECK_AVX512F], [ + ac_save_CFLAGS="$CFLAGS" + CFLAGS="-O2 -mavx512f" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + typedef double __m512d __attribute__ ((__vector_size__ (64))); + __m512d _mm512_add (__m512d a) + { + return __builtin_ia32_addpd512_mask (a, a, a, 1, 4); + }]], [[]])], + AC_DEFINE(HAVE_AVX512F, 1, + [Define if AVX512f instructions can be compiled.]), + []) + CFLAGS="$ac_save_CFLAGS" +]) diff --git a/libgfortran/config.h.in b/libgfortran/config.h.in index 22449e6..b762d099 100644 --- a/libgfortran/config.h.in +++ b/libgfortran/config.h.in @@ -78,6 +78,15 @@ /* Define to 1 if the target supports __attribute__((visibility(...))). */ #undef HAVE_ATTRIBUTE_VISIBILITY +/* Define if AVX instructions can be compiled. */ +#undef HAVE_AVX + +/* Define if AVX2 instructions can be compiled. */ +#undef HAVE_AVX2 + +/* Define if AVX512f instructions can be compiled. */ +#undef HAVE_AVX512F + /* Define to 1 if you have the `cabs' function. */ #undef HAVE_CABS diff --git a/libgfortran/configure b/libgfortran/configure index c052027..45ef935 100755 --- a/libgfortran/configure +++ b/libgfortran/configure @@ -26174,6 +26174,93 @@ $as_echo "#define HAVE_CRLF 1" >>confdefs.h fi +# Check whether we support AVX extensions + + ac_save_CFLAGS="$CFLAGS" + CFLAGS="-O2 -mavx" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + + void _mm256_zeroall (void) + { + __builtin_ia32_vzeroall (); + } +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +$as_echo "#define HAVE_AVX 1" >>confdefs.h + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CFLAGS="$ac_save_CFLAGS" + + +# Check wether we support AVX2 extensions + + ac_save_CFLAGS="$CFLAGS" + CFLAGS="-O2 -mavx2" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + + typedef long long __v4di __attribute__ ((__vector_size__ (32))); + __v4di + mm256_is32_andnotsi256 (__v4di __X, __v4di __Y) + { + return __builtin_ia32_andnotsi256 (__X, __Y); + } +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +$as_echo "#define HAVE_AVX2 1" >>confdefs.h + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CFLAGS="$ac_save_CFLAGS" + + +# Check wether we support AVX512f extensions + + ac_save_CFLAGS="$CFLAGS" + CFLAGS="-O2 -mavx512f" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + + typedef double __m512d __attribute__ ((__vector_size__ (64))); + __m512d _mm512_add (__m512d a) + { + return __builtin_ia32_addpd512_mask (a, a, a, 1, 4); + } +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +$as_echo "#define HAVE_AVX512F 1" >>confdefs.h + +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + CFLAGS="$ac_save_CFLAGS" + + cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure # tests run on this system so they can be shared between configure diff --git a/libgfortran/configure.ac b/libgfortran/configure.ac index 3de737d..bb84baf 100644 --- a/libgfortran/configure.ac +++ b/libgfortran/configure.ac @@ -609,6 +609,15 @@ LIBGFOR_CHECK_UNLINK_OPEN_FILE # Check whether line terminator is LF or CRLF LIBGFOR_CHECK_CRLF +# Check whether we support AVX extensions +LIBGFOR_CHECK_AVX + +# Check wether we support AVX2 extensions +LIBGFOR_CHECK_AVX2 + +# Check wether we support AVX512f extensions +LIBGFOR_CHECK_AVX512F + AC_CACHE_SAVE if test ${multilib} = yes; then diff --git a/libgfortran/generated/matmul_c10.c b/libgfortran/generated/matmul_c10.c index c784a26..bf40e37 100644 --- a/libgfortran/generated/matmul_c10.c +++ b/libgfortran/generated/matmul_c10.c @@ -75,6 +75,2233 @@ extern void matmul_c10 (gfc_array_c10 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_c10); + + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_c10_avx (gfc_array_c10 * const restrict retarray, + gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_c10_avx (gfc_array_c10 * const restrict retarray, + gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_10 * restrict abase; + const GFC_COMPLEX_10 * restrict bbase; + GFC_COMPLEX_10 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_10 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_10 *a, *b; + GFC_COMPLEX_10 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_10 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_10)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_10 *restrict abase_x; + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 *restrict dest_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_10 *restrict abase_x; + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 *restrict dest_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, + gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, + gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_10 * restrict abase; + const GFC_COMPLEX_10 * restrict bbase; + GFC_COMPLEX_10 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_10 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_10 *a, *b; + GFC_COMPLEX_10 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_10 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_10)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_10 *restrict abase_x; + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 *restrict dest_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_10 *restrict abase_x; + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 *restrict dest_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_c10_avx512f (gfc_array_c10 * const restrict retarray, + gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_c10_avx512f (gfc_array_c10 * const restrict retarray, + gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_10 * restrict abase; + const GFC_COMPLEX_10 * restrict bbase; + GFC_COMPLEX_10 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_10 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_10 *a, *b; + GFC_COMPLEX_10 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_10 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_10)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_10 *restrict abase_x; + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 *restrict dest_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_10 *restrict abase_x; + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 *restrict dest_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_c10_vanilla (gfc_array_c10 * const restrict retarray, + gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_10 * restrict abase; + const GFC_COMPLEX_10 * restrict bbase; + GFC_COMPLEX_10 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_10)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_10 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_10 *a, *b; + GFC_COMPLEX_10 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_10 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_10)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_10 *restrict abase_x; + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 *restrict dest_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_10)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_10 *restrict abase_x; + const GFC_COMPLEX_10 *restrict bbase_y; + GFC_COMPLEX_10 *restrict dest_y; + GFC_COMPLEX_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_c10 (gfc_array_c10 * const restrict retarray, + gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_c10 * const restrict retarray, + gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_c10_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_c10_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_c10_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_c10_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_c10 (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, @@ -607,4 +2834,10 @@ matmul_c10 (gfc_array_c10 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_c16.c b/libgfortran/generated/matmul_c16.c index 47e1bea..6e4545d 100644 --- a/libgfortran/generated/matmul_c16.c +++ b/libgfortran/generated/matmul_c16.c @@ -75,6 +75,2233 @@ extern void matmul_c16 (gfc_array_c16 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_c16); + + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_c16_avx (gfc_array_c16 * const restrict retarray, + gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_c16_avx (gfc_array_c16 * const restrict retarray, + gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_16 * restrict abase; + const GFC_COMPLEX_16 * restrict bbase; + GFC_COMPLEX_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_16 *a, *b; + GFC_COMPLEX_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_16 *restrict abase_x; + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 *restrict dest_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_16 *restrict abase_x; + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 *restrict dest_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, + gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, + gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_16 * restrict abase; + const GFC_COMPLEX_16 * restrict bbase; + GFC_COMPLEX_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_16 *a, *b; + GFC_COMPLEX_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_16 *restrict abase_x; + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 *restrict dest_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_16 *restrict abase_x; + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 *restrict dest_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_c16_avx512f (gfc_array_c16 * const restrict retarray, + gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_c16_avx512f (gfc_array_c16 * const restrict retarray, + gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_16 * restrict abase; + const GFC_COMPLEX_16 * restrict bbase; + GFC_COMPLEX_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_16 *a, *b; + GFC_COMPLEX_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_16 *restrict abase_x; + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 *restrict dest_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_16 *restrict abase_x; + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 *restrict dest_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_c16_vanilla (gfc_array_c16 * const restrict retarray, + gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_16 * restrict abase; + const GFC_COMPLEX_16 * restrict bbase; + GFC_COMPLEX_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_16 *a, *b; + GFC_COMPLEX_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_16 *restrict abase_x; + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 *restrict dest_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_16 *restrict abase_x; + const GFC_COMPLEX_16 *restrict bbase_y; + GFC_COMPLEX_16 *restrict dest_y; + GFC_COMPLEX_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_c16 (gfc_array_c16 * const restrict retarray, + gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_c16 * const restrict retarray, + gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_c16_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_c16_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_c16_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_c16_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_c16 (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, @@ -607,4 +2834,10 @@ matmul_c16 (gfc_array_c16 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_c4.c b/libgfortran/generated/matmul_c4.c index 4eb1896..6f7d5c2 100644 --- a/libgfortran/generated/matmul_c4.c +++ b/libgfortran/generated/matmul_c4.c @@ -75,6 +75,2233 @@ extern void matmul_c4 (gfc_array_c4 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_c4); + + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_c4_avx (gfc_array_c4 * const restrict retarray, + gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_c4_avx (gfc_array_c4 * const restrict retarray, + gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_4 * restrict abase; + const GFC_COMPLEX_4 * restrict bbase; + GFC_COMPLEX_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_4 *a, *b; + GFC_COMPLEX_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_4 *restrict abase_x; + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 *restrict dest_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_4 *restrict abase_x; + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 *restrict dest_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, + gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, + gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_4 * restrict abase; + const GFC_COMPLEX_4 * restrict bbase; + GFC_COMPLEX_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_4 *a, *b; + GFC_COMPLEX_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_4 *restrict abase_x; + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 *restrict dest_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_4 *restrict abase_x; + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 *restrict dest_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_c4_avx512f (gfc_array_c4 * const restrict retarray, + gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_c4_avx512f (gfc_array_c4 * const restrict retarray, + gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_4 * restrict abase; + const GFC_COMPLEX_4 * restrict bbase; + GFC_COMPLEX_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_4 *a, *b; + GFC_COMPLEX_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_4 *restrict abase_x; + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 *restrict dest_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_4 *restrict abase_x; + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 *restrict dest_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_c4_vanilla (gfc_array_c4 * const restrict retarray, + gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_4 * restrict abase; + const GFC_COMPLEX_4 * restrict bbase; + GFC_COMPLEX_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_4 *a, *b; + GFC_COMPLEX_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_4 *restrict abase_x; + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 *restrict dest_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_4 *restrict abase_x; + const GFC_COMPLEX_4 *restrict bbase_y; + GFC_COMPLEX_4 *restrict dest_y; + GFC_COMPLEX_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_c4 (gfc_array_c4 * const restrict retarray, + gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_c4 * const restrict retarray, + gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_c4_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_c4_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_c4_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_c4_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_c4 (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, @@ -607,4 +2834,10 @@ matmul_c4 (gfc_array_c4 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_c8.c b/libgfortran/generated/matmul_c8.c index 2321b9e..06916c3 100644 --- a/libgfortran/generated/matmul_c8.c +++ b/libgfortran/generated/matmul_c8.c @@ -75,6 +75,2233 @@ extern void matmul_c8 (gfc_array_c8 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_c8); + + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_c8_avx (gfc_array_c8 * const restrict retarray, + gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_c8_avx (gfc_array_c8 * const restrict retarray, + gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_8 * restrict abase; + const GFC_COMPLEX_8 * restrict bbase; + GFC_COMPLEX_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_8 *a, *b; + GFC_COMPLEX_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_8 *restrict abase_x; + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 *restrict dest_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_8 *restrict abase_x; + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 *restrict dest_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, + gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, + gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_8 * restrict abase; + const GFC_COMPLEX_8 * restrict bbase; + GFC_COMPLEX_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_8 *a, *b; + GFC_COMPLEX_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_8 *restrict abase_x; + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 *restrict dest_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_8 *restrict abase_x; + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 *restrict dest_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_c8_avx512f (gfc_array_c8 * const restrict retarray, + gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_c8_avx512f (gfc_array_c8 * const restrict retarray, + gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_8 * restrict abase; + const GFC_COMPLEX_8 * restrict bbase; + GFC_COMPLEX_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_8 *a, *b; + GFC_COMPLEX_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_8 *restrict abase_x; + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 *restrict dest_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_8 *restrict abase_x; + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 *restrict dest_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_c8_vanilla (gfc_array_c8 * const restrict retarray, + gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_COMPLEX_8 * restrict abase; + const GFC_COMPLEX_8 * restrict bbase; + GFC_COMPLEX_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_COMPLEX_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_COMPLEX_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_COMPLEX_8 *a, *b; + GFC_COMPLEX_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_COMPLEX_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_COMPLEX_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_COMPLEX_8 *restrict abase_x; + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 *restrict dest_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_COMPLEX_8 *restrict abase_x; + const GFC_COMPLEX_8 *restrict bbase_y; + GFC_COMPLEX_8 *restrict dest_y; + GFC_COMPLEX_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_COMPLEX_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_c8 (gfc_array_c8 * const restrict retarray, + gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_c8 * const restrict retarray, + gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_c8_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_c8_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_c8_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_c8_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_c8 (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, @@ -607,4 +2834,10 @@ matmul_c8 (gfc_array_c8 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_i1.c b/libgfortran/generated/matmul_i1.c index 81c067b..2cce9d1 100644 --- a/libgfortran/generated/matmul_i1.c +++ b/libgfortran/generated/matmul_i1.c @@ -75,6 +75,2233 @@ extern void matmul_i1 (gfc_array_i1 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_i1); + + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_i1_avx (gfc_array_i1 * const restrict retarray, + gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_i1_avx (gfc_array_i1 * const restrict retarray, + gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_1 * restrict abase; + const GFC_INTEGER_1 * restrict bbase; + GFC_INTEGER_1 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_1 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_1 *a, *b; + GFC_INTEGER_1 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_1 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_1)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_1 *restrict abase_x; + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 *restrict dest_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_1 *restrict abase_x; + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 *restrict dest_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, + gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, + gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_1 * restrict abase; + const GFC_INTEGER_1 * restrict bbase; + GFC_INTEGER_1 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_1 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_1 *a, *b; + GFC_INTEGER_1 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_1 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_1)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_1 *restrict abase_x; + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 *restrict dest_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_1 *restrict abase_x; + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 *restrict dest_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, + gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_i1_avx512f (gfc_array_i1 * const restrict retarray, + gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_1 * restrict abase; + const GFC_INTEGER_1 * restrict bbase; + GFC_INTEGER_1 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_1 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_1 *a, *b; + GFC_INTEGER_1 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_1 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_1)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_1 *restrict abase_x; + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 *restrict dest_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_1 *restrict abase_x; + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 *restrict dest_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_i1_vanilla (gfc_array_i1 * const restrict retarray, + gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_1 * restrict abase; + const GFC_INTEGER_1 * restrict bbase; + GFC_INTEGER_1 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_1)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_1 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_1 *a, *b; + GFC_INTEGER_1 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_1 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_1)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_1 *restrict abase_x; + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 *restrict dest_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_1)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_1 *restrict abase_x; + const GFC_INTEGER_1 *restrict bbase_y; + GFC_INTEGER_1 *restrict dest_y; + GFC_INTEGER_1 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_1) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_i1 (gfc_array_i1 * const restrict retarray, + gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_i1 * const restrict retarray, + gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_i1_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_i1_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_i1_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_i1_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_i1 (gfc_array_i1 * const restrict retarray, gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, @@ -607,4 +2834,10 @@ matmul_i1 (gfc_array_i1 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_i16.c b/libgfortran/generated/matmul_i16.c index d1b1761..76a605f 100644 --- a/libgfortran/generated/matmul_i16.c +++ b/libgfortran/generated/matmul_i16.c @@ -75,6 +75,2233 @@ extern void matmul_i16 (gfc_array_i16 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_i16); + + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_i16_avx (gfc_array_i16 * const restrict retarray, + gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_i16_avx (gfc_array_i16 * const restrict retarray, + gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_16 * restrict abase; + const GFC_INTEGER_16 * restrict bbase; + GFC_INTEGER_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_16 *a, *b; + GFC_INTEGER_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_16 *restrict abase_x; + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 *restrict dest_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_16 *restrict abase_x; + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 *restrict dest_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, + gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, + gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_16 * restrict abase; + const GFC_INTEGER_16 * restrict bbase; + GFC_INTEGER_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_16 *a, *b; + GFC_INTEGER_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_16 *restrict abase_x; + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 *restrict dest_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_16 *restrict abase_x; + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 *restrict dest_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, + gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_i16_avx512f (gfc_array_i16 * const restrict retarray, + gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_16 * restrict abase; + const GFC_INTEGER_16 * restrict bbase; + GFC_INTEGER_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_16 *a, *b; + GFC_INTEGER_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_16 *restrict abase_x; + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 *restrict dest_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_16 *restrict abase_x; + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 *restrict dest_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_i16_vanilla (gfc_array_i16 * const restrict retarray, + gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_16 * restrict abase; + const GFC_INTEGER_16 * restrict bbase; + GFC_INTEGER_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_16 *a, *b; + GFC_INTEGER_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_16 *restrict abase_x; + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 *restrict dest_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_16 *restrict abase_x; + const GFC_INTEGER_16 *restrict bbase_y; + GFC_INTEGER_16 *restrict dest_y; + GFC_INTEGER_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_i16 (gfc_array_i16 * const restrict retarray, + gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_i16 * const restrict retarray, + gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_i16_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_i16_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_i16_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_i16_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_i16 (gfc_array_i16 * const restrict retarray, gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, @@ -607,4 +2834,10 @@ matmul_i16 (gfc_array_i16 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_i2.c b/libgfortran/generated/matmul_i2.c index 5a06fcc..324197a 100644 --- a/libgfortran/generated/matmul_i2.c +++ b/libgfortran/generated/matmul_i2.c @@ -75,6 +75,2233 @@ extern void matmul_i2 (gfc_array_i2 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_i2); + + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_i2_avx (gfc_array_i2 * const restrict retarray, + gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_i2_avx (gfc_array_i2 * const restrict retarray, + gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_2 * restrict abase; + const GFC_INTEGER_2 * restrict bbase; + GFC_INTEGER_2 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_2 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_2 *a, *b; + GFC_INTEGER_2 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_2 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_2)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_2 *restrict abase_x; + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 *restrict dest_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_2 *restrict abase_x; + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 *restrict dest_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, + gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, + gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_2 * restrict abase; + const GFC_INTEGER_2 * restrict bbase; + GFC_INTEGER_2 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_2 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_2 *a, *b; + GFC_INTEGER_2 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_2 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_2)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_2 *restrict abase_x; + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 *restrict dest_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_2 *restrict abase_x; + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 *restrict dest_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, + gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_i2_avx512f (gfc_array_i2 * const restrict retarray, + gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_2 * restrict abase; + const GFC_INTEGER_2 * restrict bbase; + GFC_INTEGER_2 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_2 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_2 *a, *b; + GFC_INTEGER_2 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_2 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_2)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_2 *restrict abase_x; + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 *restrict dest_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_2 *restrict abase_x; + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 *restrict dest_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_i2_vanilla (gfc_array_i2 * const restrict retarray, + gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_2 * restrict abase; + const GFC_INTEGER_2 * restrict bbase; + GFC_INTEGER_2 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_2)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_2 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_2 *a, *b; + GFC_INTEGER_2 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_2 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_2)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_2 *restrict abase_x; + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 *restrict dest_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_2)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_2 *restrict abase_x; + const GFC_INTEGER_2 *restrict bbase_y; + GFC_INTEGER_2 *restrict dest_y; + GFC_INTEGER_2 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_2) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_i2 (gfc_array_i2 * const restrict retarray, + gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_i2 * const restrict retarray, + gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_i2_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_i2_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_i2_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_i2_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_i2 (gfc_array_i2 * const restrict retarray, gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, @@ -607,4 +2834,10 @@ matmul_i2 (gfc_array_i2 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_i4.c b/libgfortran/generated/matmul_i4.c index aee8e4d..bd31c7c 100644 --- a/libgfortran/generated/matmul_i4.c +++ b/libgfortran/generated/matmul_i4.c @@ -75,6 +75,2233 @@ extern void matmul_i4 (gfc_array_i4 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_i4); + + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_i4_avx (gfc_array_i4 * const restrict retarray, + gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_i4_avx (gfc_array_i4 * const restrict retarray, + gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_4 * restrict abase; + const GFC_INTEGER_4 * restrict bbase; + GFC_INTEGER_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_4 *a, *b; + GFC_INTEGER_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_4 *restrict abase_x; + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 *restrict dest_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_4 *restrict abase_x; + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 *restrict dest_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, + gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, + gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_4 * restrict abase; + const GFC_INTEGER_4 * restrict bbase; + GFC_INTEGER_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_4 *a, *b; + GFC_INTEGER_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_4 *restrict abase_x; + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 *restrict dest_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_4 *restrict abase_x; + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 *restrict dest_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, + gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_i4_avx512f (gfc_array_i4 * const restrict retarray, + gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_4 * restrict abase; + const GFC_INTEGER_4 * restrict bbase; + GFC_INTEGER_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_4 *a, *b; + GFC_INTEGER_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_4 *restrict abase_x; + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 *restrict dest_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_4 *restrict abase_x; + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 *restrict dest_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_i4_vanilla (gfc_array_i4 * const restrict retarray, + gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_4 * restrict abase; + const GFC_INTEGER_4 * restrict bbase; + GFC_INTEGER_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_4 *a, *b; + GFC_INTEGER_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_4 *restrict abase_x; + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 *restrict dest_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_4 *restrict abase_x; + const GFC_INTEGER_4 *restrict bbase_y; + GFC_INTEGER_4 *restrict dest_y; + GFC_INTEGER_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_i4 (gfc_array_i4 * const restrict retarray, + gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_i4 * const restrict retarray, + gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_i4_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_i4_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_i4_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_i4_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_i4 (gfc_array_i4 * const restrict retarray, gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, @@ -607,4 +2834,10 @@ matmul_i4 (gfc_array_i4 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_i8.c b/libgfortran/generated/matmul_i8.c index 902b284..c4d0327 100644 --- a/libgfortran/generated/matmul_i8.c +++ b/libgfortran/generated/matmul_i8.c @@ -75,6 +75,2233 @@ extern void matmul_i8 (gfc_array_i8 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_i8); + + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_i8_avx (gfc_array_i8 * const restrict retarray, + gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_i8_avx (gfc_array_i8 * const restrict retarray, + gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_8 * restrict abase; + const GFC_INTEGER_8 * restrict bbase; + GFC_INTEGER_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_8 *a, *b; + GFC_INTEGER_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_8 *restrict abase_x; + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 *restrict dest_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_8 *restrict abase_x; + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 *restrict dest_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, + gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, + gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_8 * restrict abase; + const GFC_INTEGER_8 * restrict bbase; + GFC_INTEGER_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_8 *a, *b; + GFC_INTEGER_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_8 *restrict abase_x; + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 *restrict dest_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_8 *restrict abase_x; + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 *restrict dest_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, + gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_i8_avx512f (gfc_array_i8 * const restrict retarray, + gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_8 * restrict abase; + const GFC_INTEGER_8 * restrict bbase; + GFC_INTEGER_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_8 *a, *b; + GFC_INTEGER_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_8 *restrict abase_x; + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 *restrict dest_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_8 *restrict abase_x; + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 *restrict dest_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_i8_vanilla (gfc_array_i8 * const restrict retarray, + gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_INTEGER_8 * restrict abase; + const GFC_INTEGER_8 * restrict bbase; + GFC_INTEGER_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_INTEGER_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_INTEGER_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_INTEGER_8 *a, *b; + GFC_INTEGER_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_INTEGER_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_INTEGER_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_INTEGER_8 *restrict abase_x; + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 *restrict dest_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_INTEGER_8 *restrict abase_x; + const GFC_INTEGER_8 *restrict bbase_y; + GFC_INTEGER_8 *restrict dest_y; + GFC_INTEGER_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_INTEGER_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_i8 (gfc_array_i8 * const restrict retarray, + gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_i8 * const restrict retarray, + gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_i8_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_i8_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_i8_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_i8_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_i8 (gfc_array_i8 * const restrict retarray, gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, @@ -607,4 +2834,10 @@ matmul_i8 (gfc_array_i8 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_r10.c b/libgfortran/generated/matmul_r10.c index 8bb1e62..b9fb10b 100644 --- a/libgfortran/generated/matmul_r10.c +++ b/libgfortran/generated/matmul_r10.c @@ -75,6 +75,2237 @@ extern void matmul_r10 (gfc_array_r10 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_r10); +#if defined(HAVE_AVX) && defined(HAVE_AVX2) +/* REAL types generate identical code for AVX and AVX2. Only generate + an AVX2 function if we are dealing with integer. */ +#undef HAVE_AVX2 +#endif + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_r10_avx (gfc_array_r10 * const restrict retarray, + gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_r10_avx (gfc_array_r10 * const restrict retarray, + gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_10 * restrict abase; + const GFC_REAL_10 * restrict bbase; + GFC_REAL_10 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_10 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_10 *a, *b; + GFC_REAL_10 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_10 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_10)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_10 *restrict abase_x; + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 *restrict dest_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_10)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_10 *restrict abase_x; + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 *restrict dest_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, + gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, + gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_10 * restrict abase; + const GFC_REAL_10 * restrict bbase; + GFC_REAL_10 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_10 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_10 *a, *b; + GFC_REAL_10 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_10 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_10)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_10 *restrict abase_x; + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 *restrict dest_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_10)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_10 *restrict abase_x; + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 *restrict dest_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_r10_avx512f (gfc_array_r10 * const restrict retarray, + gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_r10_avx512f (gfc_array_r10 * const restrict retarray, + gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_10 * restrict abase; + const GFC_REAL_10 * restrict bbase; + GFC_REAL_10 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_10 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_10 *a, *b; + GFC_REAL_10 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_10 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_10)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_10 *restrict abase_x; + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 *restrict dest_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_10)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_10 *restrict abase_x; + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 *restrict dest_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_r10_vanilla (gfc_array_r10 * const restrict retarray, + gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_10 * restrict abase; + const GFC_REAL_10 * restrict bbase; + GFC_REAL_10 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_10)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_10 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_10 *a, *b; + GFC_REAL_10 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_10 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_10)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_10 *restrict abase_x; + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 *restrict dest_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_10)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_10 *restrict abase_x; + const GFC_REAL_10 *restrict bbase_y; + GFC_REAL_10 *restrict dest_y; + GFC_REAL_10 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_10) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_r10 (gfc_array_r10 * const restrict retarray, + gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_r10 * const restrict retarray, + gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_r10_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_r10_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_r10_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_r10_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_r10 (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, @@ -607,4 +2838,10 @@ matmul_r10 (gfc_array_r10 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_r16.c b/libgfortran/generated/matmul_r16.c index 4ebd104..65ac801 100644 --- a/libgfortran/generated/matmul_r16.c +++ b/libgfortran/generated/matmul_r16.c @@ -75,6 +75,2237 @@ extern void matmul_r16 (gfc_array_r16 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_r16); +#if defined(HAVE_AVX) && defined(HAVE_AVX2) +/* REAL types generate identical code for AVX and AVX2. Only generate + an AVX2 function if we are dealing with integer. */ +#undef HAVE_AVX2 +#endif + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_r16_avx (gfc_array_r16 * const restrict retarray, + gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_r16_avx (gfc_array_r16 * const restrict retarray, + gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_16 * restrict abase; + const GFC_REAL_16 * restrict bbase; + GFC_REAL_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_16 *a, *b; + GFC_REAL_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_16 *restrict abase_x; + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 *restrict dest_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_16 *restrict abase_x; + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 *restrict dest_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, + gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, + gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_16 * restrict abase; + const GFC_REAL_16 * restrict bbase; + GFC_REAL_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_16 *a, *b; + GFC_REAL_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_16 *restrict abase_x; + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 *restrict dest_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_16 *restrict abase_x; + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 *restrict dest_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_r16_avx512f (gfc_array_r16 * const restrict retarray, + gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_r16_avx512f (gfc_array_r16 * const restrict retarray, + gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_16 * restrict abase; + const GFC_REAL_16 * restrict bbase; + GFC_REAL_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_16 *a, *b; + GFC_REAL_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_16 *restrict abase_x; + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 *restrict dest_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_16 *restrict abase_x; + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 *restrict dest_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_r16_vanilla (gfc_array_r16 * const restrict retarray, + gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_16 * restrict abase; + const GFC_REAL_16 * restrict bbase; + GFC_REAL_16 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_16)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_16 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_16 *a, *b; + GFC_REAL_16 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_16 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_16)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_16 *restrict abase_x; + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 *restrict dest_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_16)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_16 *restrict abase_x; + const GFC_REAL_16 *restrict bbase_y; + GFC_REAL_16 *restrict dest_y; + GFC_REAL_16 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_16) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_r16 (gfc_array_r16 * const restrict retarray, + gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_r16 * const restrict retarray, + gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_r16_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_r16_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_r16_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_r16_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_r16 (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, @@ -607,4 +2838,10 @@ matmul_r16 (gfc_array_r16 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_r4.c b/libgfortran/generated/matmul_r4.c index cf3ffa3..2a85d6b 100644 --- a/libgfortran/generated/matmul_r4.c +++ b/libgfortran/generated/matmul_r4.c @@ -75,6 +75,2237 @@ extern void matmul_r4 (gfc_array_r4 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_r4); +#if defined(HAVE_AVX) && defined(HAVE_AVX2) +/* REAL types generate identical code for AVX and AVX2. Only generate + an AVX2 function if we are dealing with integer. */ +#undef HAVE_AVX2 +#endif + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_r4_avx (gfc_array_r4 * const restrict retarray, + gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_r4_avx (gfc_array_r4 * const restrict retarray, + gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_4 * restrict abase; + const GFC_REAL_4 * restrict bbase; + GFC_REAL_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_4 *a, *b; + GFC_REAL_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_4 *restrict abase_x; + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 *restrict dest_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_4 *restrict abase_x; + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 *restrict dest_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, + gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, + gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_4 * restrict abase; + const GFC_REAL_4 * restrict bbase; + GFC_REAL_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_4 *a, *b; + GFC_REAL_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_4 *restrict abase_x; + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 *restrict dest_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_4 *restrict abase_x; + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 *restrict dest_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_r4_avx512f (gfc_array_r4 * const restrict retarray, + gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_r4_avx512f (gfc_array_r4 * const restrict retarray, + gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_4 * restrict abase; + const GFC_REAL_4 * restrict bbase; + GFC_REAL_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_4 *a, *b; + GFC_REAL_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_4 *restrict abase_x; + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 *restrict dest_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_4 *restrict abase_x; + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 *restrict dest_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_r4_vanilla (gfc_array_r4 * const restrict retarray, + gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_4 * restrict abase; + const GFC_REAL_4 * restrict bbase; + GFC_REAL_4 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_4 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_4 *a, *b; + GFC_REAL_4 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_4 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_4)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_4 *restrict abase_x; + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 *restrict dest_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_4 *restrict abase_x; + const GFC_REAL_4 *restrict bbase_y; + GFC_REAL_4 *restrict dest_y; + GFC_REAL_4 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_4) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_r4 (gfc_array_r4 * const restrict retarray, + gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_r4 * const restrict retarray, + gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_r4_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_r4_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_r4_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_r4_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_r4 (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, @@ -607,4 +2838,10 @@ matmul_r4 (gfc_array_r4 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/generated/matmul_r8.c b/libgfortran/generated/matmul_r8.c index 9a70a23..78bf52e 100644 --- a/libgfortran/generated/matmul_r8.c +++ b/libgfortran/generated/matmul_r8.c @@ -75,6 +75,2237 @@ extern void matmul_r8 (gfc_array_r8 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_r8); +#if defined(HAVE_AVX) && defined(HAVE_AVX2) +/* REAL types generate identical code for AVX and AVX2. Only generate + an AVX2 function if we are dealing with integer. */ +#undef HAVE_AVX2 +#endif + + +/* Put exhaustive list of possible architectures here here, ORed together. */ + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) + +#ifdef HAVE_AVX +static void +matmul_r8_avx (gfc_array_r8 * const restrict retarray, + gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static void +matmul_r8_avx (gfc_array_r8 * const restrict retarray, + gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_8 * restrict abase; + const GFC_REAL_8 * restrict bbase; + GFC_REAL_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_8 *a, *b; + GFC_REAL_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_8 *restrict abase_x; + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 *restrict dest_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_8 *restrict abase_x; + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 *restrict dest_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +static void +matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, + gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static void +matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, + gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_8 * restrict abase; + const GFC_REAL_8 * restrict bbase; + GFC_REAL_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_8 *a, *b; + GFC_REAL_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_8 *restrict abase_x; + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 *restrict dest_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_8 *restrict abase_x; + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 *restrict dest_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +static void +matmul_r8_avx512f (gfc_array_r8 * const restrict retarray, + gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static void +matmul_r8_avx512f (gfc_array_r8 * const restrict retarray, + gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_8 * restrict abase; + const GFC_REAL_8 * restrict bbase; + GFC_REAL_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_8 *a, *b; + GFC_REAL_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_8 *restrict abase_x; + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 *restrict dest_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_8 *restrict abase_x; + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 *restrict dest_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + +#endif /* HAVE_AVX512F */ + +/* Function to fall back to if there is no special processor-specific version. */ +static void +matmul_r8_vanilla (gfc_array_r8 * const restrict retarray, + gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const GFC_REAL_8 * restrict abase; + const GFC_REAL_8 * restrict bbase; + GFC_REAL_8 * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_8)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } + + + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const GFC_REAL_8 one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const GFC_REAL_8 *a, *b; + GFC_REAL_8 *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + GFC_REAL_8 t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = (GFC_REAL_8)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const GFC_REAL_8 *restrict abase_x; + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 *restrict dest_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_8)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const GFC_REAL_8 *restrict abase_x; + const GFC_REAL_8 *restrict bbase_y; + GFC_REAL_8 *restrict dest_y; + GFC_REAL_8 s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = (GFC_REAL_8) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max + + +/* Compiling main function, with selection code for the processor. */ + +/* Currently, this is i386 only. Adjust for other architectures. */ + +#include +void matmul_r8 (gfc_array_r8 * const restrict retarray, + gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) (gfc_array_r8 * const restrict retarray, + gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; + + if (matmul_p == NULL) + { + matmul_p = matmul_r8_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) + { + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) + { + matmul_p = matmul_r8_avx512f; + goto tailcall; + } + +#endif /* HAVE_AVX512F */ + +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + { + matmul_p = matmul_r8_avx2; + goto tailcall; + } + +#endif + +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_r8_avx; + goto tailcall; + } +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + void matmul_r8 (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, @@ -607,4 +2838,10 @@ matmul_r8 (gfc_array_r8 * const restrict retarray, } } } +#undef POW3 +#undef min +#undef max + #endif +#endif + diff --git a/libgfortran/m4/matmul.m4 b/libgfortran/m4/matmul.m4 index 77ed440..4e5bf60 100644 --- a/libgfortran/m4/matmul.m4 +++ b/libgfortran/m4/matmul.m4 @@ -76,537 +76,105 @@ extern void matmul_'rtype_code` ('rtype` * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_'rtype_code`); -void -matmul_'rtype_code` ('rtype` * const restrict retarray, - 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, - int blas_limit, blas_call gemm) -{ - const 'rtype_name` * restrict abase; - const 'rtype_name` * restrict bbase; - 'rtype_name` * restrict dest; - - index_type rxstride, rystride, axstride, aystride, bxstride, bystride; - index_type x, y, n, count, xcount, ycount; - - assert (GFC_DESCRIPTOR_RANK (a) == 2 - || GFC_DESCRIPTOR_RANK (b) == 2); - -/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] - - Either A or B (but not both) can be rank 1: - - o One-dimensional argument A is implicitly treated as a row matrix - dimensioned [1,count], so xcount=1. - - o One-dimensional argument B is implicitly treated as a column matrix - dimensioned [count, 1], so ycount=1. -*/ +'ifelse(rtype_letter,`r',dnl +`#if defined(HAVE_AVX) && defined(HAVE_AVX2) +/* REAL types generate identical code for AVX and AVX2. Only generate + an AVX2 function if we are dealing with integer. */ +#undef HAVE_AVX2 +#endif') +` - if (retarray->base_addr == NULL) - { - if (GFC_DESCRIPTOR_RANK (a) == 1) - { - GFC_DIMENSION_SET(retarray->dim[0], 0, - GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); - } - else if (GFC_DESCRIPTOR_RANK (b) == 1) - { - GFC_DIMENSION_SET(retarray->dim[0], 0, - GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); - } - else - { - GFC_DIMENSION_SET(retarray->dim[0], 0, - GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); - - GFC_DIMENSION_SET(retarray->dim[1], 0, - GFC_DESCRIPTOR_EXTENT(b,1) - 1, - GFC_DESCRIPTOR_EXTENT(retarray,0)); - } +/* Put exhaustive list of possible architectures here here, ORed together. */ - retarray->base_addr - = xmallocarray (size0 ((array_t *) retarray), sizeof ('rtype_name`)); - retarray->offset = 0; - } - else if (unlikely (compile_options.bounds_check)) - { - index_type ret_extent, arg_extent; +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) - if (GFC_DESCRIPTOR_RANK (a) == 1) - { - arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); - ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); - if (arg_extent != ret_extent) - runtime_error ("Incorrect extent in return array in" - " MATMUL intrinsic: is %ld, should be %ld", - (long int) ret_extent, (long int) arg_extent); - } - else if (GFC_DESCRIPTOR_RANK (b) == 1) - { - arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); - ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); - if (arg_extent != ret_extent) - runtime_error ("Incorrect extent in return array in" - " MATMUL intrinsic: is %ld, should be %ld", - (long int) ret_extent, (long int) arg_extent); - } - else - { - arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); - ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); - if (arg_extent != ret_extent) - runtime_error ("Incorrect extent in return array in" - " MATMUL intrinsic for dimension 1:" - " is %ld, should be %ld", - (long int) ret_extent, (long int) arg_extent); - - arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); - ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); - if (arg_extent != ret_extent) - runtime_error ("Incorrect extent in return array in" - " MATMUL intrinsic for dimension 2:" - " is %ld, should be %ld", - (long int) ret_extent, (long int) arg_extent); - } - } -' -sinclude(`matmul_asm_'rtype_code`.m4')dnl -` - if (GFC_DESCRIPTOR_RANK (retarray) == 1) - { - /* One-dimensional result may be addressed in the code below - either as a row or a column matrix. We want both cases to - work. */ - rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); - } - else - { - rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); - rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); - } +#ifdef HAVE_AVX +'define(`matmul_name',`matmul_'rtype_code`_avx')dnl +`static void +'matmul_name` ('rtype` * const restrict retarray, + 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx"))); +static' include(matmul_internal.m4)dnl +`#endif /* HAVE_AVX */ + +#ifdef HAVE_AVX2 +'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl +`static void +'matmul_name` ('rtype` * const restrict retarray, + 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); +static' include(matmul_internal.m4)dnl +`#endif /* HAVE_AVX2 */ + +#ifdef HAVE_AVX512F +'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl +`static void +'matmul_name` ('rtype` * const restrict retarray, + 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, + int blas_limit, blas_call gemm) __attribute__((__target__("avx512f"))); +static' include(matmul_internal.m4)dnl +`#endif /* HAVE_AVX512F */ +/* Function to fall back to if there is no special processor-specific version. */ +'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl +`static' include(matmul_internal.m4)dnl - if (GFC_DESCRIPTOR_RANK (a) == 1) - { - /* Treat it as a a row matrix A[1,count]. */ - axstride = GFC_DESCRIPTOR_STRIDE(a,0); - aystride = 1; - - xcount = 1; - count = GFC_DESCRIPTOR_EXTENT(a,0); - } - else - { - axstride = GFC_DESCRIPTOR_STRIDE(a,0); - aystride = GFC_DESCRIPTOR_STRIDE(a,1); +`/* Compiling main function, with selection code for the processor. */ - count = GFC_DESCRIPTOR_EXTENT(a,1); - xcount = GFC_DESCRIPTOR_EXTENT(a,0); - } +/* Currently, this is i386 only. Adjust for other architectures. */ - if (count != GFC_DESCRIPTOR_EXTENT(b,0)) - { - if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) - runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); - } - - if (GFC_DESCRIPTOR_RANK (b) == 1) - { - /* Treat it as a column matrix B[count,1] */ - bxstride = GFC_DESCRIPTOR_STRIDE(b,0); - - /* bystride should never be used for 1-dimensional b. - in case it is we want it to cause a segfault, rather than - an incorrect result. */ - bystride = 0xDEADBEEF; - ycount = 1; - } - else - { - bxstride = GFC_DESCRIPTOR_STRIDE(b,0); - bystride = GFC_DESCRIPTOR_STRIDE(b,1); - ycount = GFC_DESCRIPTOR_EXTENT(b,1); - } - - abase = a->base_addr; - bbase = b->base_addr; - dest = retarray->base_addr; - - /* Now that everything is set up, we perform the multiplication - itself. */ - -#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) -#define min(a,b) ((a) <= (b) ? (a) : (b)) -#define max(a,b) ((a) >= (b) ? (a) : (b)) - - if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) - && (bxstride == 1 || bystride == 1) - && (((float) xcount) * ((float) ycount) * ((float) count) - > POW3(blas_limit))) - { - const int m = xcount, n = ycount, k = count, ldc = rystride; - const 'rtype_name` one = 1, zero = 0; - const int lda = (axstride == 1) ? aystride : axstride, - ldb = (bxstride == 1) ? bystride : bxstride; +#include +void matmul_'rtype_code` ('rtype` * const restrict retarray, + 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + static void (*matmul_p) ('rtype` * const restrict retarray, + 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, + int blas_limit, blas_call gemm) = NULL; - if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) - { - assert (gemm != NULL); - gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, - &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, - &ldc, 1, 1); - return; - } - } - - if (rxstride == 1 && axstride == 1 && bxstride == 1) + if (matmul_p == NULL) { - /* This block of code implements a tuned matmul, derived from - Superscalar GEMM-based level 3 BLAS, Beta version 0.1 - - Bo Kagstrom and Per Ling - Department of Computing Science - Umea University - S-901 87 Umea, Sweden - - from netlib.org, translated to C, and modified for matmul.m4. */ - - const 'rtype_name` *a, *b; - 'rtype_name` *c; - const index_type m = xcount, n = ycount, k = count; - - /* System generated locals */ - index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, - i1, i2, i3, i4, i5, i6; - - /* Local variables */ - 'rtype_name` t1[65536], /* was [256][256] */ - f11, f12, f21, f22, f31, f32, f41, f42, - f13, f14, f23, f24, f33, f34, f43, f44; - index_type i, j, l, ii, jj, ll; - index_type isec, jsec, lsec, uisec, ujsec, ulsec; - - a = abase; - b = bbase; - c = retarray->base_addr; - - /* Parameter adjustments */ - c_dim1 = rystride; - c_offset = 1 + c_dim1; - c -= c_offset; - a_dim1 = aystride; - a_offset = 1 + a_dim1; - a -= a_offset; - b_dim1 = bystride; - b_offset = 1 + b_dim1; - b -= b_offset; - - /* Early exit if possible */ - if (m == 0 || n == 0 || k == 0) - return; - - /* Empty c first. */ - for (j=1; j<=n; j++) - for (i=1; i<=m; i++) - c[i + j * c_dim1] = ('rtype_name`)0; - - /* Start turning the crank. */ - i1 = n; - for (jj = 1; jj <= i1; jj += 512) + matmul_p = matmul_'rtype_code`_vanilla; + if (__cpu_model.__cpu_vendor == VENDOR_INTEL) { - /* Computing MIN */ - i2 = 512; - i3 = n - jj + 1; - jsec = min(i2,i3); - ujsec = jsec - jsec % 4; - i2 = k; - for (ll = 1; ll <= i2; ll += 256) + /* Run down the available processors in order of preference. */ +#ifdef HAVE_AVX512F + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F)) { - /* Computing MIN */ - i3 = 256; - i4 = k - ll + 1; - lsec = min(i3,i4); - ulsec = lsec - lsec % 2; - - i3 = m; - for (ii = 1; ii <= i3; ii += 256) - { - /* Computing MIN */ - i4 = 256; - i5 = m - ii + 1; - isec = min(i4,i5); - uisec = isec - isec % 2; - i4 = ll + ulsec - 1; - for (l = ll; l <= i4; l += 2) - { - i5 = ii + uisec - 1; - for (i = ii; i <= i5; i += 2) - { - t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = - a[i + l * a_dim1]; - t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = - a[i + (l + 1) * a_dim1]; - t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = - a[i + 1 + l * a_dim1]; - t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = - a[i + 1 + (l + 1) * a_dim1]; - } - if (uisec < isec) - { - t1[l - ll + 1 + (isec << 8) - 257] = - a[ii + isec - 1 + l * a_dim1]; - t1[l - ll + 2 + (isec << 8) - 257] = - a[ii + isec - 1 + (l + 1) * a_dim1]; - } - } - if (ulsec < lsec) - { - i4 = ii + isec - 1; - for (i = ii; i<= i4; ++i) - { - t1[lsec + ((i - ii + 1) << 8) - 257] = - a[i + (ll + lsec - 1) * a_dim1]; - } - } - - uisec = isec - isec % 4; - i4 = jj + ujsec - 1; - for (j = jj; j <= i4; j += 4) - { - i5 = ii + uisec - 1; - for (i = ii; i <= i5; i += 4) - { - f11 = c[i + j * c_dim1]; - f21 = c[i + 1 + j * c_dim1]; - f12 = c[i + (j + 1) * c_dim1]; - f22 = c[i + 1 + (j + 1) * c_dim1]; - f13 = c[i + (j + 2) * c_dim1]; - f23 = c[i + 1 + (j + 2) * c_dim1]; - f14 = c[i + (j + 3) * c_dim1]; - f24 = c[i + 1 + (j + 3) * c_dim1]; - f31 = c[i + 2 + j * c_dim1]; - f41 = c[i + 3 + j * c_dim1]; - f32 = c[i + 2 + (j + 1) * c_dim1]; - f42 = c[i + 3 + (j + 1) * c_dim1]; - f33 = c[i + 2 + (j + 2) * c_dim1]; - f43 = c[i + 3 + (j + 2) * c_dim1]; - f34 = c[i + 2 + (j + 3) * c_dim1]; - f44 = c[i + 3 + (j + 3) * c_dim1]; - i6 = ll + lsec - 1; - for (l = ll; l <= i6; ++l) - { - f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] - * b[l + j * b_dim1]; - f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] - * b[l + j * b_dim1]; - f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] - * b[l + (j + 1) * b_dim1]; - f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] - * b[l + (j + 1) * b_dim1]; - f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] - * b[l + (j + 2) * b_dim1]; - f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] - * b[l + (j + 2) * b_dim1]; - f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] - * b[l + (j + 3) * b_dim1]; - f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] - * b[l + (j + 3) * b_dim1]; - f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] - * b[l + j * b_dim1]; - f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] - * b[l + j * b_dim1]; - f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] - * b[l + (j + 1) * b_dim1]; - f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] - * b[l + (j + 1) * b_dim1]; - f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] - * b[l + (j + 2) * b_dim1]; - f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] - * b[l + (j + 2) * b_dim1]; - f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] - * b[l + (j + 3) * b_dim1]; - f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] - * b[l + (j + 3) * b_dim1]; - } - c[i + j * c_dim1] = f11; - c[i + 1 + j * c_dim1] = f21; - c[i + (j + 1) * c_dim1] = f12; - c[i + 1 + (j + 1) * c_dim1] = f22; - c[i + (j + 2) * c_dim1] = f13; - c[i + 1 + (j + 2) * c_dim1] = f23; - c[i + (j + 3) * c_dim1] = f14; - c[i + 1 + (j + 3) * c_dim1] = f24; - c[i + 2 + j * c_dim1] = f31; - c[i + 3 + j * c_dim1] = f41; - c[i + 2 + (j + 1) * c_dim1] = f32; - c[i + 3 + (j + 1) * c_dim1] = f42; - c[i + 2 + (j + 2) * c_dim1] = f33; - c[i + 3 + (j + 2) * c_dim1] = f43; - c[i + 2 + (j + 3) * c_dim1] = f34; - c[i + 3 + (j + 3) * c_dim1] = f44; - } - if (uisec < isec) - { - i5 = ii + isec - 1; - for (i = ii + uisec; i <= i5; ++i) - { - f11 = c[i + j * c_dim1]; - f12 = c[i + (j + 1) * c_dim1]; - f13 = c[i + (j + 2) * c_dim1]; - f14 = c[i + (j + 3) * c_dim1]; - i6 = ll + lsec - 1; - for (l = ll; l <= i6; ++l) - { - f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - - 257] * b[l + j * b_dim1]; - f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - - 257] * b[l + (j + 1) * b_dim1]; - f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - - 257] * b[l + (j + 2) * b_dim1]; - f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - - 257] * b[l + (j + 3) * b_dim1]; - } - c[i + j * c_dim1] = f11; - c[i + (j + 1) * c_dim1] = f12; - c[i + (j + 2) * c_dim1] = f13; - c[i + (j + 3) * c_dim1] = f14; - } - } - } - if (ujsec < jsec) - { - i4 = jj + jsec - 1; - for (j = jj + ujsec; j <= i4; ++j) - { - i5 = ii + uisec - 1; - for (i = ii; i <= i5; i += 4) - { - f11 = c[i + j * c_dim1]; - f21 = c[i + 1 + j * c_dim1]; - f31 = c[i + 2 + j * c_dim1]; - f41 = c[i + 3 + j * c_dim1]; - i6 = ll + lsec - 1; - for (l = ll; l <= i6; ++l) - { - f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - - 257] * b[l + j * b_dim1]; - f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - - 257] * b[l + j * b_dim1]; - f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - - 257] * b[l + j * b_dim1]; - f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - - 257] * b[l + j * b_dim1]; - } - c[i + j * c_dim1] = f11; - c[i + 1 + j * c_dim1] = f21; - c[i + 2 + j * c_dim1] = f31; - c[i + 3 + j * c_dim1] = f41; - } - i5 = ii + isec - 1; - for (i = ii + uisec; i <= i5; ++i) - { - f11 = c[i + j * c_dim1]; - i6 = ll + lsec - 1; - for (l = ll; l <= i6; ++l) - { - f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - - 257] * b[l + j * b_dim1]; - } - c[i + j * c_dim1] = f11; - } - } - } - } + matmul_p = matmul_'rtype_code`_avx512f; + goto tailcall; } - } - return; - } - else if (rxstride == 1 && aystride == 1 && bxstride == 1) - { - if (GFC_DESCRIPTOR_RANK (a) != 1) - { - const 'rtype_name` *restrict abase_x; - const 'rtype_name` *restrict bbase_y; - 'rtype_name` *restrict dest_y; - 'rtype_name` s; - for (y = 0; y < ycount; y++) - { - bbase_y = &bbase[y*bystride]; - dest_y = &dest[y*rystride]; - for (x = 0; x < xcount; x++) - { - abase_x = &abase[x*axstride]; - s = ('rtype_name`) 0; - for (n = 0; n < count; n++) - s += abase_x[n] * bbase_y[n]; - dest_y[x] = s; - } - } - } - else - { - const 'rtype_name` *restrict bbase_y; - 'rtype_name` s; +#endif /* HAVE_AVX512F */ - for (y = 0; y < ycount; y++) +#ifdef HAVE_AVX2 + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) { - bbase_y = &bbase[y*bystride]; - s = ('rtype_name`) 0; - for (n = 0; n < count; n++) - s += abase[n*axstride] * bbase_y[n]; - dest[y*rystride] = s; + matmul_p = matmul_'rtype_code`_avx2; + goto tailcall; } - } - } - else if (axstride < aystride) - { - for (y = 0; y < ycount; y++) - for (x = 0; x < xcount; x++) - dest[x*rxstride + y*rystride] = ('rtype_name`)0; - - for (y = 0; y < ycount; y++) - for (n = 0; n < count; n++) - for (x = 0; x < xcount; x++) - /* dest[x,y] += a[x,n] * b[n,y] */ - dest[x*rxstride + y*rystride] += - abase[x*axstride + n*aystride] * - bbase[n*bxstride + y*bystride]; - } - else if (GFC_DESCRIPTOR_RANK (a) == 1) - { - const 'rtype_name` *restrict bbase_y; - 'rtype_name` s; - for (y = 0; y < ycount; y++) - { - bbase_y = &bbase[y*bystride]; - s = ('rtype_name`) 0; - for (n = 0; n < count; n++) - s += abase[n*axstride] * bbase_y[n*bxstride]; - dest[y*rxstride] = s; - } - } - else - { - const 'rtype_name` *restrict abase_x; - const 'rtype_name` *restrict bbase_y; - 'rtype_name` *restrict dest_y; - 'rtype_name` s; +#endif - for (y = 0; y < ycount; y++) - { - bbase_y = &bbase[y*bystride]; - dest_y = &dest[y*rystride]; - for (x = 0; x < xcount; x++) - { - abase_x = &abase[x*axstride]; - s = ('rtype_name`) 0; - for (n = 0; n < count; n++) - s += abase_x[n*aystride] * bbase_y[n*bxstride]; - dest_y[x*rxstride] = s; +#ifdef HAVE_AVX + if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX)) + { + matmul_p = matmul_'rtype_code`_avx; + goto tailcall; } - } - } -}' +#endif /* HAVE_AVX */ + } + } + +tailcall: + (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm); +} + +#else /* Just the vanilla function. */ + +'define(`matmul_name',`matmul_'rtype_code)dnl +define(`target_attribute',`')dnl +include(matmul_internal.m4)dnl +`#endif #endif +' diff --git a/libgfortran/m4/matmul_internal.m4 b/libgfortran/m4/matmul_internal.m4 new file mode 100644 index 0000000..d35968b --- /dev/null +++ b/libgfortran/m4/matmul_internal.m4 @@ -0,0 +1,537 @@ +`void +'matmul_name` ('rtype` * const restrict retarray, + 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, + int blas_limit, blas_call gemm) +{ + const 'rtype_name` * restrict abase; + const 'rtype_name` * restrict bbase; + 'rtype_name` * restrict dest; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; + + assert (GFC_DESCRIPTOR_RANK (a) == 2 + || GFC_DESCRIPTOR_RANK (b) == 2); + +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. +*/ + + if (retarray->base_addr == NULL) + { + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + } + else + { + GFC_DIMENSION_SET(retarray->dim[0], 0, + GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1); + + GFC_DIMENSION_SET(retarray->dim[1], 0, + GFC_DESCRIPTOR_EXTENT(b,1) - 1, + GFC_DESCRIPTOR_EXTENT(retarray,0)); + } + + retarray->base_addr + = xmallocarray (size0 ((array_t *) retarray), sizeof ('rtype_name`)); + retarray->offset = 0; + } + else if (unlikely (compile_options.bounds_check)) + { + index_type ret_extent, arg_extent; + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else if (GFC_DESCRIPTOR_RANK (b) == 1) + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic: is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + else + { + arg_extent = GFC_DESCRIPTOR_EXTENT(a,0); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 1:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + + arg_extent = GFC_DESCRIPTOR_EXTENT(b,1); + ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1); + if (arg_extent != ret_extent) + runtime_error ("Incorrect extent in return array in" + " MATMUL intrinsic for dimension 2:" + " is %ld, should be %ld", + (long int) ret_extent, (long int) arg_extent); + } + } +' +sinclude(`matmul_asm_'rtype_code`.m4')dnl +` + if (GFC_DESCRIPTOR_RANK (retarray) == 1) + { + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0); + } + else + { + rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0); + rystride = GFC_DESCRIPTOR_STRIDE(retarray,1); + } + + + if (GFC_DESCRIPTOR_RANK (a) == 1) + { + /* Treat it as a a row matrix A[1,count]. */ + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = 1; + + xcount = 1; + count = GFC_DESCRIPTOR_EXTENT(a,0); + } + else + { + axstride = GFC_DESCRIPTOR_STRIDE(a,0); + aystride = GFC_DESCRIPTOR_STRIDE(a,1); + + count = GFC_DESCRIPTOR_EXTENT(a,1); + xcount = GFC_DESCRIPTOR_EXTENT(a,0); + } + + if (count != GFC_DESCRIPTOR_EXTENT(b,0)) + { + if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0) + runtime_error ("dimension of array B incorrect in MATMUL intrinsic"); + } + + if (GFC_DESCRIPTOR_RANK (b) == 1) + { + /* Treat it as a column matrix B[count,1] */ + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; + ycount = 1; + } + else + { + bxstride = GFC_DESCRIPTOR_STRIDE(b,0); + bystride = GFC_DESCRIPTOR_STRIDE(b,1); + ycount = GFC_DESCRIPTOR_EXTENT(b,1); + } + + abase = a->base_addr; + bbase = b->base_addr; + dest = retarray->base_addr; + + /* Now that everything is set up, we perform the multiplication + itself. */ + +#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x))) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? (a) : (b)) + + if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1) + && (bxstride == 1 || bystride == 1) + && (((float) xcount) * ((float) ycount) * ((float) count) + > POW3(blas_limit))) + { + const int m = xcount, n = ycount, k = count, ldc = rystride; + const 'rtype_name` one = 1, zero = 0; + const int lda = (axstride == 1) ? aystride : axstride, + ldb = (bxstride == 1) ? bystride : bxstride; + + if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1) + { + assert (gemm != NULL); + gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, + &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest, + &ldc, 1, 1); + return; + } + } + + if (rxstride == 1 && axstride == 1 && bxstride == 1) + { + /* This block of code implements a tuned matmul, derived from + Superscalar GEMM-based level 3 BLAS, Beta version 0.1 + + Bo Kagstrom and Per Ling + Department of Computing Science + Umea University + S-901 87 Umea, Sweden + + from netlib.org, translated to C, and modified for matmul.m4. */ + + const 'rtype_name` *a, *b; + 'rtype_name` *c; + const index_type m = xcount, n = ycount, k = count; + + /* System generated locals */ + index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, + i1, i2, i3, i4, i5, i6; + + /* Local variables */ + 'rtype_name` t1[65536], /* was [256][256] */ + f11, f12, f21, f22, f31, f32, f41, f42, + f13, f14, f23, f24, f33, f34, f43, f44; + index_type i, j, l, ii, jj, ll; + index_type isec, jsec, lsec, uisec, ujsec, ulsec; + + a = abase; + b = bbase; + c = retarray->base_addr; + + /* Parameter adjustments */ + c_dim1 = rystride; + c_offset = 1 + c_dim1; + c -= c_offset; + a_dim1 = aystride; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = bystride; + b_offset = 1 + b_dim1; + b -= b_offset; + + /* Early exit if possible */ + if (m == 0 || n == 0 || k == 0) + return; + + /* Empty c first. */ + for (j=1; j<=n; j++) + for (i=1; i<=m; i++) + c[i + j * c_dim1] = ('rtype_name`)0; + + /* Start turning the crank. */ + i1 = n; + for (jj = 1; jj <= i1; jj += 512) + { + /* Computing MIN */ + i2 = 512; + i3 = n - jj + 1; + jsec = min(i2,i3); + ujsec = jsec - jsec % 4; + i2 = k; + for (ll = 1; ll <= i2; ll += 256) + { + /* Computing MIN */ + i3 = 256; + i4 = k - ll + 1; + lsec = min(i3,i4); + ulsec = lsec - lsec % 2; + + i3 = m; + for (ii = 1; ii <= i3; ii += 256) + { + /* Computing MIN */ + i4 = 256; + i5 = m - ii + 1; + isec = min(i4,i5); + uisec = isec - isec % 2; + i4 = ll + ulsec - 1; + for (l = ll; l <= i4; l += 2) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 2) + { + t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] = + a[i + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] = + a[i + (l + 1) * a_dim1]; + t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + l * a_dim1]; + t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] = + a[i + 1 + (l + 1) * a_dim1]; + } + if (uisec < isec) + { + t1[l - ll + 1 + (isec << 8) - 257] = + a[ii + isec - 1 + l * a_dim1]; + t1[l - ll + 2 + (isec << 8) - 257] = + a[ii + isec - 1 + (l + 1) * a_dim1]; + } + } + if (ulsec < lsec) + { + i4 = ii + isec - 1; + for (i = ii; i<= i4; ++i) + { + t1[lsec + ((i - ii + 1) << 8) - 257] = + a[i + (ll + lsec - 1) * a_dim1]; + } + } + + uisec = isec - isec % 4; + i4 = jj + ujsec - 1; + for (j = jj; j <= i4; j += 4) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f22 = c[i + 1 + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f23 = c[i + 1 + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + f24 = c[i + 1 + (j + 3) * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + f32 = c[i + 2 + (j + 1) * c_dim1]; + f42 = c[i + 3 + (j + 1) * c_dim1]; + f33 = c[i + 2 + (j + 2) * c_dim1]; + f43 = c[i + 3 + (j + 2) * c_dim1]; + f34 = c[i + 2 + (j + 3) * c_dim1]; + f44 = c[i + 3 + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + j * b_dim1]; + f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 1) * b_dim1]; + f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 2) * b_dim1]; + f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257] + * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + (j + 1) * c_dim1] = f12; + c[i + 1 + (j + 1) * c_dim1] = f22; + c[i + (j + 2) * c_dim1] = f13; + c[i + 1 + (j + 2) * c_dim1] = f23; + c[i + (j + 3) * c_dim1] = f14; + c[i + 1 + (j + 3) * c_dim1] = f24; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + c[i + 2 + (j + 1) * c_dim1] = f32; + c[i + 3 + (j + 1) * c_dim1] = f42; + c[i + 2 + (j + 2) * c_dim1] = f33; + c[i + 3 + (j + 2) * c_dim1] = f43; + c[i + 2 + (j + 3) * c_dim1] = f34; + c[i + 3 + (j + 3) * c_dim1] = f44; + } + if (uisec < isec) + { + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + f12 = c[i + (j + 1) * c_dim1]; + f13 = c[i + (j + 2) * c_dim1]; + f14 = c[i + (j + 3) * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 1) * b_dim1]; + f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 2) * b_dim1]; + f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + (j + 3) * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + (j + 1) * c_dim1] = f12; + c[i + (j + 2) * c_dim1] = f13; + c[i + (j + 3) * c_dim1] = f14; + } + } + } + if (ujsec < jsec) + { + i4 = jj + jsec - 1; + for (j = jj + ujsec; j <= i4; ++j) + { + i5 = ii + uisec - 1; + for (i = ii; i <= i5; i += 4) + { + f11 = c[i + j * c_dim1]; + f21 = c[i + 1 + j * c_dim1]; + f31 = c[i + 2 + j * c_dim1]; + f41 = c[i + 3 + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - + 257] * b[l + j * b_dim1]; + f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - + 257] * b[l + j * b_dim1]; + f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + c[i + 1 + j * c_dim1] = f21; + c[i + 2 + j * c_dim1] = f31; + c[i + 3 + j * c_dim1] = f41; + } + i5 = ii + isec - 1; + for (i = ii + uisec; i <= i5; ++i) + { + f11 = c[i + j * c_dim1]; + i6 = ll + lsec - 1; + for (l = ll; l <= i6; ++l) + { + f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - + 257] * b[l + j * b_dim1]; + } + c[i + j * c_dim1] = f11; + } + } + } + } + } + } + return; + } + else if (rxstride == 1 && aystride == 1 && bxstride == 1) + { + if (GFC_DESCRIPTOR_RANK (a) != 1) + { + const 'rtype_name` *restrict abase_x; + const 'rtype_name` *restrict bbase_y; + 'rtype_name` *restrict dest_y; + 'rtype_name` s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = ('rtype_name`) 0; + for (n = 0; n < count; n++) + s += abase_x[n] * bbase_y[n]; + dest_y[x] = s; + } + } + } + else + { + const 'rtype_name` *restrict bbase_y; + 'rtype_name` s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = ('rtype_name`) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n]; + dest[y*rystride] = s; + } + } + } + else if (axstride < aystride) + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = ('rtype_name`)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += + abase[x*axstride + n*aystride] * + bbase[n*bxstride + y*bystride]; + } + else if (GFC_DESCRIPTOR_RANK (a) == 1) + { + const 'rtype_name` *restrict bbase_y; + 'rtype_name` s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + s = ('rtype_name`) 0; + for (n = 0; n < count; n++) + s += abase[n*axstride] * bbase_y[n*bxstride]; + dest[y*rxstride] = s; + } + } + else + { + const 'rtype_name` *restrict abase_x; + const 'rtype_name` *restrict bbase_y; + 'rtype_name` *restrict dest_y; + 'rtype_name` s; + + for (y = 0; y < ycount; y++) + { + bbase_y = &bbase[y*bystride]; + dest_y = &dest[y*rystride]; + for (x = 0; x < xcount; x++) + { + abase_x = &abase[x*axstride]; + s = ('rtype_name`) 0; + for (n = 0; n < count; n++) + s += abase_x[n*aystride] * bbase_y[n*bxstride]; + dest_y[x*rxstride] = s; + } + } + } +} +#undef POW3 +#undef min +#undef max +' -- cgit v1.1 From f9b6b9291db6eab79e3f4c177cf4cf264aac6dda Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sat, 3 Dec 2016 16:10:43 +0000 Subject: baseline_symbols.txt: Regenerate. * config/abi/post/hppa-linux-gnu/baseline_symbols.txt: Regenerate. From-SVN: r243220 --- libstdc++-v3/ChangeLog | 4 + .../abi/post/hppa-linux-gnu/baseline_symbols.txt | 95 ++++++++++++++++++++++ 2 files changed, 99 insertions(+) diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 08d9229..451b6ee 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,7 @@ +2016-12-03 John David Anglin + + * config/abi/post/hppa-linux-gnu/baseline_symbols.txt: Regenerate. + 2016-12-01 David Edelsohn * testsuite/26_numerics/headers/cmath/hypot.cc: XFAIL on AIX. diff --git a/libstdc++-v3/config/abi/post/hppa-linux-gnu/baseline_symbols.txt b/libstdc++-v3/config/abi/post/hppa-linux-gnu/baseline_symbols.txt index ab37045..47b4b62 100644 --- a/libstdc++-v3/config/abi/post/hppa-linux-gnu/baseline_symbols.txt +++ b/libstdc++-v3/config/abi/post/hppa-linux-gnu/baseline_symbols.txt @@ -1,3 +1,72 @@ +FUNC:_ZGTtNKSt11logic_error4whatEv@@GLIBCXX_3.4.22 +FUNC:_ZGTtNKSt13bad_exception4whatEv@@CXXABI_1.3.10 +FUNC:_ZGTtNKSt13bad_exceptionD1Ev@@CXXABI_1.3.10 +FUNC:_ZGTtNKSt13runtime_error4whatEv@@GLIBCXX_3.4.22 +FUNC:_ZGTtNKSt9exception4whatEv@@CXXABI_1.3.10 +FUNC:_ZGTtNKSt9exceptionD1Ev@@CXXABI_1.3.10 +FUNC:_ZGTtNSt11logic_errorC1EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11logic_errorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11logic_errorC2EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11logic_errorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11logic_errorD0Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11logic_errorD1Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11logic_errorD2Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11range_errorC1EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11range_errorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11range_errorC2EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11range_errorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11range_errorD0Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11range_errorD1Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt11range_errorD2Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12domain_errorC1EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12domain_errorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12domain_errorC2EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12domain_errorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12domain_errorD0Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12domain_errorD1Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12domain_errorD2Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12length_errorC1EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12length_errorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12length_errorC2EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12length_errorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12length_errorD0Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12length_errorD1Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12length_errorD2Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12out_of_rangeC1EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12out_of_rangeC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12out_of_rangeC2EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12out_of_rangeC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12out_of_rangeD0Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12out_of_rangeD1Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt12out_of_rangeD2Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt13runtime_errorC1EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt13runtime_errorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt13runtime_errorC2EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt13runtime_errorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt13runtime_errorD0Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt13runtime_errorD1Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt13runtime_errorD2Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt14overflow_errorC1EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt14overflow_errorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt14overflow_errorC2EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt14overflow_errorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt14overflow_errorD0Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt14overflow_errorD1Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt14overflow_errorD2Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt15underflow_errorC1EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt15underflow_errorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt15underflow_errorC2EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt15underflow_errorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt15underflow_errorD0Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt15underflow_errorD1Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt15underflow_errorD2Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt16invalid_argumentC1EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt16invalid_argumentC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt16invalid_argumentC2EPKc@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt16invalid_argumentC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt16invalid_argumentD0Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt16invalid_argumentD1Ev@@GLIBCXX_3.4.22 +FUNC:_ZGTtNSt16invalid_argumentD2Ev@@GLIBCXX_3.4.22 FUNC:_ZN10__cxxabiv116__enum_type_infoD0Ev@@CXXABI_1.3 FUNC:_ZN10__cxxabiv116__enum_type_infoD1Ev@@CXXABI_1.3 FUNC:_ZN10__cxxabiv116__enum_type_infoD2Ev@@CXXABI_1.3 @@ -100,6 +169,7 @@ FUNC:_ZN9__gnu_cxx6__poolILb1EE16_M_get_thread_idEv@@GLIBCXX_3.4.4 FUNC:_ZN9__gnu_cxx6__poolILb1EE16_M_reclaim_blockEPcj@@GLIBCXX_3.4.4 FUNC:_ZN9__gnu_cxx6__poolILb1EE16_M_reserve_blockEjj@@GLIBCXX_3.4.4 FUNC:_ZN9__gnu_cxx6__poolILb1EE21_M_destroy_thread_keyEPv@@GLIBCXX_3.4.4 +FUNC:_ZN9__gnu_cxx9__freeresEv@@CXXABI_1.3.10 FUNC:_ZN9__gnu_cxx9free_list6_M_getEj@@GLIBCXX_3.4.4 FUNC:_ZN9__gnu_cxx9free_list8_M_clearEv@@GLIBCXX_3.4.4 FUNC:_ZNK10__cxxabiv117__class_type_info10__do_catchEPKSt9type_infoPPvj@@CXXABI_1.3 @@ -1510,6 +1580,7 @@ FUNC:_ZNSsC1EPKcRKSaIcE@@GLIBCXX_3.4 FUNC:_ZNSsC1EPKcjRKSaIcE@@GLIBCXX_3.4 FUNC:_ZNSsC1ERKSaIcE@@GLIBCXX_3.4 FUNC:_ZNSsC1ERKSs@@GLIBCXX_3.4 +FUNC:_ZNSsC1ERKSsjRKSaIcE@@GLIBCXX_3.4.23 FUNC:_ZNSsC1ERKSsjj@@GLIBCXX_3.4 FUNC:_ZNSsC1ERKSsjjRKSaIcE@@GLIBCXX_3.4 FUNC:_ZNSsC1ESt16initializer_listIcERKSaIcE@@GLIBCXX_3.4.11 @@ -1523,6 +1594,7 @@ FUNC:_ZNSsC2EPKcRKSaIcE@@GLIBCXX_3.4 FUNC:_ZNSsC2EPKcjRKSaIcE@@GLIBCXX_3.4 FUNC:_ZNSsC2ERKSaIcE@@GLIBCXX_3.4 FUNC:_ZNSsC2ERKSs@@GLIBCXX_3.4 +FUNC:_ZNSsC2ERKSsjRKSaIcE@@GLIBCXX_3.4.23 FUNC:_ZNSsC2ERKSsjj@@GLIBCXX_3.4 FUNC:_ZNSsC2ERKSsjjRKSaIcE@@GLIBCXX_3.4 FUNC:_ZNSsC2ESt16initializer_listIcERKSaIcE@@GLIBCXX_3.4.11 @@ -2162,6 +2234,7 @@ FUNC:_ZNSt15_List_node_base8transferEPS_S0_@@GLIBCXX_3.4 FUNC:_ZNSt15_List_node_base9_M_unhookEv@@GLIBCXX_3.4.14 FUNC:_ZNSt15__exception_ptr13exception_ptr4swapERS0_@@CXXABI_1.3.3 FUNC:_ZNSt15__exception_ptr13exception_ptrC1EMS0_FvvE@@CXXABI_1.3.3 +FUNC:_ZNSt15__exception_ptr13exception_ptrC1EPv@@CXXABI_1.3.11 FUNC:_ZNSt15__exception_ptr13exception_ptrC1ERKS0_@@CXXABI_1.3.3 FUNC:_ZNSt15__exception_ptr13exception_ptrC1Ev@@CXXABI_1.3.3 FUNC:_ZNSt15__exception_ptr13exception_ptrC2EMS0_FvvE@@CXXABI_1.3.3 @@ -2707,7 +2780,9 @@ FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_disposeEv@@GLIBCX FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEjjPKcj@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_S_compareEjj@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE11_M_capacityEj@@GLIBCXX_3.4.21 +FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_Alloc_hiderC1EPcOS3_@@GLIBCXX_3.4.23 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_Alloc_hiderC1EPcRKS3_@@GLIBCXX_3.4.21 +FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_Alloc_hiderC2EPcOS3_@@GLIBCXX_3.4.23 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_Alloc_hiderC2EPcRKS3_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructEjc@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIN9__gnu_cxx17__normal_iteratorIPKcS4_EEEEvT_SB_St20forward_iterator_tag@@GLIBCXX_3.4.21 @@ -2806,6 +2881,7 @@ FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1EPKcjRKS3_@@GLIBCXX_ FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1ERKS3_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1ERKS4_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1ERKS4_RKS3_@@GLIBCXX_3.4.21 +FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1ERKS4_jRKS3_@@GLIBCXX_3.4.23 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1ERKS4_jj@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1ERKS4_jjRKS3_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC1ESt16initializer_listIcERKS3_@@GLIBCXX_3.4.21 @@ -2821,6 +2897,7 @@ FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC2EPKcjRKS3_@@GLIBCXX_ FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC2ERKS3_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC2ERKS4_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC2ERKS4_RKS3_@@GLIBCXX_3.4.21 +FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC2ERKS4_jRKS3_@@GLIBCXX_3.4.23 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC2ERKS4_jj@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC2ERKS4_jjRKS3_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEC2ESt16initializer_listIcERKS3_@@GLIBCXX_3.4.21 @@ -2846,7 +2923,9 @@ FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEE10_M_disposeEv@@GLIBCX FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEE10_M_replaceEjjPKwj@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEE10_S_compareEjj@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEE11_M_capacityEj@@GLIBCXX_3.4.21 +FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEE12_Alloc_hiderC1EPwOS3_@@GLIBCXX_3.4.23 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEE12_Alloc_hiderC1EPwRKS3_@@GLIBCXX_3.4.21 +FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEE12_Alloc_hiderC2EPwOS3_@@GLIBCXX_3.4.23 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEE12_Alloc_hiderC2EPwRKS3_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEE12_M_constructEjw@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEE12_M_constructIN9__gnu_cxx17__normal_iteratorIPKwS4_EEEEvT_SB_St20forward_iterator_tag@@GLIBCXX_3.4.21 @@ -2945,6 +3024,7 @@ FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC1EPKwjRKS3_@@GLIBCXX_ FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC1ERKS3_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC1ERKS4_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC1ERKS4_RKS3_@@GLIBCXX_3.4.21 +FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC1ERKS4_jRKS3_@@GLIBCXX_3.4.23 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC1ERKS4_jj@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC1ERKS4_jjRKS3_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC1ESt16initializer_listIwERKS3_@@GLIBCXX_3.4.21 @@ -2960,6 +3040,7 @@ FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC2EPKwjRKS3_@@GLIBCXX_ FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC2ERKS3_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC2ERKS4_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC2ERKS4_RKS3_@@GLIBCXX_3.4.21 +FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC2ERKS4_jRKS3_@@GLIBCXX_3.4.23 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC2ERKS4_jj@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC2ERKS4_jjRKS3_@@GLIBCXX_3.4.21 FUNC:_ZNSt7__cxx1112basic_stringIwSt11char_traitsIwESaIwEEC2ESt16initializer_listIwERKS3_@@GLIBCXX_3.4.21 @@ -3829,14 +3910,24 @@ FUNC:_ZTv0_n12_NSt9strstreamD0Ev@@GLIBCXX_3.4 FUNC:_ZTv0_n12_NSt9strstreamD1Ev@@GLIBCXX_3.4 FUNC:_ZdaPv@@GLIBCXX_3.4 FUNC:_ZdaPvRKSt9nothrow_t@@GLIBCXX_3.4 +FUNC:_ZdaPvSt11align_val_t@@CXXABI_1.3.11 +FUNC:_ZdaPvSt11align_val_tRKSt9nothrow_t@@CXXABI_1.3.11 FUNC:_ZdaPvj@@CXXABI_1.3.9 +FUNC:_ZdaPvjSt11align_val_t@@CXXABI_1.3.11 FUNC:_ZdlPv@@GLIBCXX_3.4 FUNC:_ZdlPvRKSt9nothrow_t@@GLIBCXX_3.4 +FUNC:_ZdlPvSt11align_val_t@@CXXABI_1.3.11 +FUNC:_ZdlPvSt11align_val_tRKSt9nothrow_t@@CXXABI_1.3.11 FUNC:_ZdlPvj@@CXXABI_1.3.9 +FUNC:_ZdlPvjSt11align_val_t@@CXXABI_1.3.11 FUNC:_Znaj@@GLIBCXX_3.4 FUNC:_ZnajRKSt9nothrow_t@@GLIBCXX_3.4 +FUNC:_ZnajSt11align_val_t@@CXXABI_1.3.11 +FUNC:_ZnajSt11align_val_tRKSt9nothrow_t@@CXXABI_1.3.11 FUNC:_Znwj@@GLIBCXX_3.4 FUNC:_ZnwjRKSt9nothrow_t@@GLIBCXX_3.4 +FUNC:_ZnwjSt11align_val_t@@CXXABI_1.3.11 +FUNC:_ZnwjSt11align_val_tRKSt9nothrow_t@@CXXABI_1.3.11 FUNC:__atomic_flag_for_address@@GLIBCXX_3.4.11 FUNC:__atomic_flag_wait_explicit@@GLIBCXX_3.4.11 FUNC:__cxa_allocate_dependent_exception@@CXXABI_1.3.6 @@ -3857,6 +3948,7 @@ FUNC:__cxa_get_globals_fast@@CXXABI_1.3 FUNC:__cxa_guard_abort@@CXXABI_1.3 FUNC:__cxa_guard_acquire@@CXXABI_1.3 FUNC:__cxa_guard_release@@CXXABI_1.3 +FUNC:__cxa_init_primary_exception@@CXXABI_1.3.11 FUNC:__cxa_pure_virtual@@CXXABI_1.3 FUNC:__cxa_rethrow@@CXXABI_1.3 FUNC:__cxa_thread_atexit@@CXXABI_1.3.7 @@ -3903,6 +3995,8 @@ FUNC:tanhl@GLIBCXX_3.4 FUNC:tanl@GLIBCXX_3.4 OBJECT:0:CXXABI_1.3 OBJECT:0:CXXABI_1.3.1 +OBJECT:0:CXXABI_1.3.10 +OBJECT:0:CXXABI_1.3.11 OBJECT:0:CXXABI_1.3.2 OBJECT:0:CXXABI_1.3.3 OBJECT:0:CXXABI_1.3.4 @@ -3928,6 +4022,7 @@ OBJECT:0:GLIBCXX_3.4.2 OBJECT:0:GLIBCXX_3.4.20 OBJECT:0:GLIBCXX_3.4.21 OBJECT:0:GLIBCXX_3.4.22 +OBJECT:0:GLIBCXX_3.4.23 OBJECT:0:GLIBCXX_3.4.3 OBJECT:0:GLIBCXX_3.4.4 OBJECT:0:GLIBCXX_3.4.5 -- cgit v1.1 From 7c3cace17c44e3fa1bf8f7128d2f9b8fefb4f865 Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Sat, 3 Dec 2016 17:29:43 +0000 Subject: constraints.md (U): Adjust comment. * config/sparc/constraints.md (U): Adjust comment. * config/sparc/sparc.md (lra): New attribute. (enabled): For base instructions, if the lra attribute is set, return 1 if it is in keeping with TARGET_LRA. (*movdi_insn_sp32): Add lra attribute for alternatives mentioning U constraint and duplicate them with U replaced by r. (*movdf_insn_sp32): Likewise. (*mov_insn_sp32): Likewise. (*movtf_insn_sp32): Remove alternatives mentioning U constraint. From-SVN: r243221 --- gcc/ChangeLog | 15 +++++++++- gcc/config/sparc/constraints.md | 8 ++--- gcc/config/sparc/sparc.md | 65 +++++++++++++++++++++++++---------------- 3 files changed, 58 insertions(+), 30 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 96ae900..396580a 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,4 +1,17 @@ -2016-12-01 Jeff Law +2016-12-03 Eric Botcazou + David S. Miller + + * config/sparc/constraints.md (U): Adjust comment. + * config/sparc/sparc.md (lra): New attribute. + (enabled): For base instructions, if the lra attribute is set, + return 1 if it is in keeping with TARGET_LRA. + (*movdi_insn_sp32): Add lra attribute for alternatives mentioning U + constraint and duplicate them with U replaced by r. + (*movdf_insn_sp32): Likewise. + (*mov_insn_sp32): Likewise. + (*movtf_insn_sp32): Remove alternatives mentioning U constraint. + +2016-12-02 Jeff Law * config/arm/arm.c (arm_handle_cmse_nonsecure_call): Remove unused variable main_variant. diff --git a/gcc/config/sparc/constraints.md b/gcc/config/sparc/constraints.md index 191c0bf..626ced7 100644 --- a/gcc/config/sparc/constraints.md +++ b/gcc/config/sparc/constraints.md @@ -166,10 +166,10 @@ ;; example, we have a non-offsetable MEM. Reload will notice this ;; case and reload the address into a single hard register. ;; -;; The real downfall of this awkward register constraint is that it does -;; not evaluate to a true register class like a bonafide use of -;; define_register_constraint would. This currently means that we cannot -;; use LRA on Sparc, since the constraint processing of LRA really depends +;; The real downfall of this awkward register constraint is that it +;; does not evaluate to a true register class like a bonafide use of +;; define_register_constraint would. This means that we cannot use +;; it with LRA, since the constraint processing of LRA really depends ;; upon whether an extra constraint is for registers or not. It uses ;; reg_class_for_constraint, and checks it against NO_REGS. (define_constraint "U" diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md index 896ce4b..000c5a3 100644 --- a/gcc/config/sparc/sparc.md +++ b/gcc/config/sparc/sparc.md @@ -254,8 +254,14 @@ (define_attr "cpu_feature" "none,fpu,fpunotv9,v9,vis,vis3,vis4" (const_string "none")) +(define_attr "lra" "none,disabled,enabled" + (const_string "none")) + (define_attr "enabled" "" - (cond [(eq_attr "cpu_feature" "none") (const_int 1) + (cond [(eq_attr "cpu_feature" "none") + (cond [(eq_attr "lra" "disabled") (symbol_ref "!TARGET_LRA") + (eq_attr "lra" "enabled") (symbol_ref "TARGET_LRA")] + (const_int 1)) (eq_attr "cpu_feature" "fpu") (symbol_ref "TARGET_FPU") (eq_attr "cpu_feature" "fpunotv9") (symbol_ref "TARGET_FPU && !TARGET_V9") (eq_attr "cpu_feature" "v9") (symbol_ref "TARGET_V9") @@ -1701,9 +1707,9 @@ (define_insn "*movdi_insn_sp32" [(set (match_operand:DI 0 "nonimmediate_operand" - "=T,o,T,U,o,r,r,r,?T,?*f,?*f,?o,?*e,?*e, r,?*f,?*e,?W,b,b") + "=T,o,T,T,U,r,o,r,r,r,?T,?*f,?*f,?o,?*e,?*e, r,?*f,?*e,?W,b,b") (match_operand:DI 1 "input_operand" - " J,J,U,T,r,o,i,r,*f, T, o,*f, *e, *e,?*f, r, W,*e,J,P"))] + " J,J,U,r,T,T,r,o,i,r,*f, T, o,*f, *e, *e,?*f, r, W,*e,J,P"))] "TARGET_ARCH32 && (register_operand (operands[0], DImode) || register_or_zero_operand (operands[1], DImode))" @@ -1711,6 +1717,8 @@ stx\t%%g0, %0 # std\t%1, %0 + std\t%1, %0 + ldd\t%1, %0 ldd\t%1, %0 # # @@ -1728,11 +1736,12 @@ std\t%1, %0 fzero\t%0 fone\t%0" - [(set_attr "type" "store,store,store,load,*,*,*,*,fpstore,fpload,*,*,fpmove,*,*,*,fpload,fpstore,visl,visl") - (set_attr "length" "*,2,*,*,2,2,2,2,*,*,2,2,*,2,2,2,*,*,*,*") - (set_attr "fptype" "*,*,*,*,*,*,*,*,*,*,*,*,double,*,*,*,*,*,double,double") - (set_attr "cpu_feature" "v9,*,*,*,*,*,*,*,fpu,fpu,fpu,fpu,v9,fpunotv9,vis3,vis3,fpu,fpu,vis,vis") - (set_attr "v3pipe" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,true,true")]) + [(set_attr "type" "store,store,store,store,load,load,*,*,*,*,fpstore,fpload,*,*,fpmove,*,*,*,fpload,fpstore,visl,visl") + (set_attr "length" "*,2,*,*,*,*,2,2,2,2,*,*,2,2,*,2,2,2,*,*,*,*") + (set_attr "fptype" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,double,*,*,*,*,*,double,double") + (set_attr "cpu_feature" "v9,*,*,*,*,*,*,*,*,*,fpu,fpu,fpu,fpu,v9,fpunotv9,vis3,vis3,fpu,fpu,vis,vis") + (set_attr "v3pipe" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,true,true") + (set_attr "lra" "*,*,disabled,enabled,disabled,enabled,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")]) (define_insn "*movdi_insn_sp64" [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,r, m, r,*e,?*e,?*e,?W,b,b") @@ -2372,9 +2381,9 @@ (define_insn "*movdf_insn_sp32" [(set (match_operand:DF 0 "nonimmediate_operand" - "=b,b,e,e,*r, f, e,T,W,U,T, f, *r, o,o") + "=b,b,e,e,*r, f, e,T,W,U,r,T,T, f, *r, o,o") (match_operand:DF 1 "input_operand" - " G,C,e,e, f,*r,W#F,G,e,T,U,o#F,*roF,*rG,f"))] + " G,C,e,e, f,*r,W#F,G,e,T,T,U,r,o#F,*roF,*rG,f"))] "TARGET_ARCH32 && (register_operand (operands[0], DFmode) || register_or_zero_or_all_ones_operand (operands[1], DFmode))" @@ -2389,16 +2398,19 @@ stx\t%r1, %0 std\t%1, %0 ldd\t%1, %0 + ldd\t%1, %0 + std\t%1, %0 std\t%1, %0 # # # #" - [(set_attr "type" "visl,visl,fpmove,*,*,*,fpload,store,fpstore,load,store,*,*,*,*") - (set_attr "length" "*,*,*,2,2,2,*,*,*,*,*,2,2,2,2") - (set_attr "fptype" "double,double,double,*,*,*,*,*,*,*,*,*,*,*,*") - (set_attr "cpu_feature" "vis,vis,v9,fpunotv9,vis3,vis3,fpu,v9,fpu,*,*,fpu,*,*,fpu") - (set_attr "v3pipe" "true,true,*,*,*,*,*,*,*,*,*,*,*,*,*")]) + [(set_attr "type" "visl,visl,fpmove,*,*,*,fpload,store,fpstore,load,load,store,store,*,*,*,*") + (set_attr "length" "*,*,*,2,2,2,*,*,*,*,*,*,*,2,2,2,2") + (set_attr "fptype" "double,double,double,*,*,*,*,*,*,*,*,*,*,*,*,*,*") + (set_attr "cpu_feature" "vis,vis,v9,fpunotv9,vis3,vis3,fpu,v9,fpu,*,*,*,*,fpu,*,*,fpu") + (set_attr "v3pipe" "true,true,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*") + (set_attr "lra" "*,*,*,*,*,*,*,*,*,disabled,enabled,disabled,enabled,*,*,*,*")]) (define_insn "*movdf_insn_sp64" [(set (match_operand:DF 0 "nonimmediate_operand" "=b,b,e,*r, e, e,W, *r,*r, m,*r") @@ -2609,14 +2621,14 @@ }) (define_insn "*movtf_insn_sp32" - [(set (match_operand:TF 0 "nonimmediate_operand" "=b, e,o, o,U, r") - (match_operand:TF 1 "input_operand" " G,oe,e,rGU,o,roG"))] + [(set (match_operand:TF 0 "nonimmediate_operand" "=b, e,o, o, r") + (match_operand:TF 1 "input_operand" " G,oe,e,rG,roG"))] "TARGET_ARCH32 && (register_operand (operands[0], TFmode) || register_or_zero_operand (operands[1], TFmode))" "#" - [(set_attr "length" "4,4,4,4,4,4") - (set_attr "cpu_feature" "fpu,fpu,fpu,*,*,*")]) + [(set_attr "length" "4,4,4,4,4") + (set_attr "cpu_feature" "fpu,fpu,fpu,*,*")]) (define_insn "*movtf_insn_sp64" [(set (match_operand:TF 0 "nonimmediate_operand" "=b, e,o, o, r") @@ -8636,8 +8648,8 @@ (set_attr "v3pipe" "true,true,true,*,*,*,*,*,*,*,*")]) (define_insn "*mov_insn_sp32" - [(set (match_operand:VM64 0 "nonimmediate_operand" "=e,e,e,*r, f,e,m,m,U,T, o,*r") - (match_operand:VM64 1 "input_operand" "Y,C,e, f,*r,m,e,Y,T,U,*r,*r"))] + [(set (match_operand:VM64 0 "nonimmediate_operand" "=e,e,e,*r, f,e,m,m,U,r,T,T, o,*r") + (match_operand:VM64 1 "input_operand" "Y,C,e, f,*r,m,e,Y,T,T,U,r,*r,*r"))] "TARGET_VIS && TARGET_ARCH32 && (register_operand (operands[0], mode) @@ -8652,13 +8664,16 @@ std\t%1, %0 stx\t%r1, %0 ldd\t%1, %0 + ldd\t%1, %0 + std\t%1, %0 std\t%1, %0 # #" - [(set_attr "type" "visl,visl,vismv,*,*,fpload,fpstore,store,load,store,*,*") - (set_attr "length" "*,*,*,2,2,*,*,*,*,*,2,2") - (set_attr "cpu_feature" "vis,vis,vis,vis3,vis3,*,*,*,*,*,*,*") - (set_attr "v3pipe" "true,true,true,*,*,*,*,*,*,*,*,*")]) + [(set_attr "type" "visl,visl,vismv,*,*,fpload,fpstore,store,load,load,store,store,*,*") + (set_attr "length" "*,*,*,2,2,*,*,*,*,*,*,*,2,2") + (set_attr "cpu_feature" "vis,vis,vis,vis3,vis3,*,*,*,*,*,*,*,*,*") + (set_attr "v3pipe" "true,true,true,*,*,*,*,*,*,*,*,*,*,*") + (set_attr "lra" "*,*,*,*,*,*,*,*,disabled,enabled,disabled,enabled,*,*")]) (define_split [(set (match_operand:VM64 0 "memory_operand" "") -- cgit v1.1 From 54b84aa9456004ceee2bceb813843525ee11bd7d Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Sat, 3 Dec 2016 17:37:13 +0000 Subject: lra-constraints.c (emit_spill_move): Use gen_lowpart_SUBREG in all cases to build a lowpart SUBREG. * lra-constraints.c (emit_spill_move): Use gen_lowpart_SUBREG in all cases to build a lowpart SUBREG. From-SVN: r243222 --- gcc/ChangeLog | 5 +++++ gcc/lra-constraints.c | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 396580a..496c691 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,4 +1,9 @@ 2016-12-03 Eric Botcazou + + * lra-constraints.c (emit_spill_move): Use gen_lowpart_SUBREG in all + cases to build a lowpart SUBREG. + +2016-12-03 Eric Botcazou David S. Miller * config/sparc/constraints.md (U): Adjust comment. diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c index 260591a..e661aef 100644 --- a/gcc/lra-constraints.c +++ b/gcc/lra-constraints.c @@ -1109,9 +1109,9 @@ emit_spill_move (bool to_p, rtx mem_pseudo, rtx val) dependent macro HARD_REGNO_CALLER_SAVE_MODE. */ if (! MEM_P (val)) { - val = gen_rtx_SUBREG (GET_MODE (mem_pseudo), - GET_CODE (val) == SUBREG ? SUBREG_REG (val) : val, - 0); + val = gen_lowpart_SUBREG (GET_MODE (mem_pseudo), + GET_CODE (val) == SUBREG + ? SUBREG_REG (val) : val); LRA_SUBREG_P (val) = 1; } else -- cgit v1.1 From 20fee4a901c0cf097df1f1ba00e226f1eb8e973c Mon Sep 17 00:00:00 2001 From: Janus Weil Date: Sat, 3 Dec 2016 19:37:57 +0100 Subject: re PR fortran/42188 ([OOP] F03:C612. The leftmost part-name shall be the name of a data object.) 2016-12-03 Janus Weil PR fortran/42188 * primary.c (gfc_match_rvalue): Add a new check that gives better error messages. 2016-12-03 Janus Weil PR fortran/42188 * gfortran.dg/derived_result_2.f90.f90: New test case. From-SVN: r243223 --- gcc/fortran/ChangeLog | 6 ++++ gcc/fortran/primary.c | 9 ++++++ gcc/testsuite/ChangeLog | 5 +++ gcc/testsuite/gfortran.dg/derived_result_2.f90 | 45 ++++++++++++++++++++++++++ 4 files changed, 65 insertions(+) create mode 100644 gcc/testsuite/gfortran.dg/derived_result_2.f90 diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index 7a007c3..eaae696 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,5 +1,11 @@ 2016-12-03 Janus Weil + PR fortran/42188 + * primary.c (gfc_match_rvalue): Add a new check that gives better error + messages. + +2016-12-03 Janus Weil + PR fortran/58175 * resolve.c (gfc_resolve_finalizers): Prevent bogus warning. diff --git a/gcc/fortran/primary.c b/gcc/fortran/primary.c index eb2d780..2cdc9a4 100644 --- a/gcc/fortran/primary.c +++ b/gcc/fortran/primary.c @@ -3298,6 +3298,15 @@ gfc_match_rvalue (gfc_expr **result) if (sym->result == NULL) sym->result = sym; + gfc_gobble_whitespace (); + /* F08:C612. */ + if (gfc_peek_ascii_char() == '%') + { + gfc_error ("The leftmost part-ref in a data-ref can not be a " + "function reference at %C"); + m = MATCH_ERROR; + } + m = MATCH_YES; break; diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 39a5c59..0d95973 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,10 @@ 2016-12-03 Janus Weil + PR fortran/42188 + * gfortran.dg/derived_result_2.f90.f90: New test case. + +2016-12-03 Janus Weil + PR fortran/58175 * gfortran.dg/finalize_30.f90: Extend test case. diff --git a/gcc/testsuite/gfortran.dg/derived_result_2.f90 b/gcc/testsuite/gfortran.dg/derived_result_2.f90 new file mode 100644 index 0000000..51f5b86 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/derived_result_2.f90 @@ -0,0 +1,45 @@ +! { dg-do compile } +! +! PR 42188: [OOP] F03:C612. The leftmost part-name shall be the name of a data object +! +! Contributed by Janus Weil + +module grid_module + implicit none + type grid + contains + procedure :: new_grid + procedure :: new_int + end type +contains + subroutine new_grid(this) + class(grid) :: this + end subroutine + integer function new_int(this) + class(grid) :: this + new_int = 42 + end function +end module + +module field_module + use grid_module + implicit none + + type field + type(grid) :: mesh + end type + +contains + + type(field) function new_field() + end function + + subroutine test + integer :: i + type(grid) :: g + g = new_field()%mesh ! { dg-error "can not be a function reference" } + call new_field()%mesh%new_grid() ! { dg-error "Syntax error" } + i = new_field() % mesh%new_int() ! { dg-error "can not be a function reference" } + end subroutine + +end module -- cgit v1.1 From 4ceda20498ec82d0fd24a2ab1e56c6c1e38b0174 Mon Sep 17 00:00:00 2001 From: Janus Weil Date: Sat, 3 Dec 2016 19:48:48 +0100 Subject: re PR fortran/43207 ([OOP] invalid (pointer) assignment to and from abstract non-polymorphic expressions) 2016-12-03 Janus Weil PR fortran/43207 * primary.c (gfc_match_varspec): Reject nonpolymorphic references to abstract types. 2016-12-03 Janus Weil PR fortran/43207 * gfortran.dg/abstract_type_9.f90: New test case. From-SVN: r243224 --- gcc/fortran/ChangeLog | 6 +++++ gcc/fortran/primary.c | 10 +++++++- gcc/testsuite/ChangeLog | 5 ++++ gcc/testsuite/gfortran.dg/abstract_type_9.f90 | 34 +++++++++++++++++++++++++++ 4 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gfortran.dg/abstract_type_9.f90 diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index eaae696..3489bc4 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,5 +1,11 @@ 2016-12-03 Janus Weil + PR fortran/43207 + * primary.c (gfc_match_varspec): Reject nonpolymorphic references to + abstract types. + +2016-12-03 Janus Weil + PR fortran/42188 * primary.c (gfc_match_rvalue): Add a new check that gives better error messages. diff --git a/gcc/fortran/primary.c b/gcc/fortran/primary.c index 2cdc9a4..f13b0f0 100644 --- a/gcc/fortran/primary.c +++ b/gcc/fortran/primary.c @@ -2222,7 +2222,15 @@ check_substring: } } - /* F2008, C727. */ + /* F08:C611. */ + if (primary->ts.type == BT_DERIVED && primary->ref + && primary->ts.u.derived && primary->ts.u.derived->attr.abstract) + { + gfc_error ("Nonpolymorphic reference to abstract type at %C"); + return MATCH_ERROR; + } + + /* F08:C727. */ if (primary->expr_type == EXPR_PPC && gfc_is_coindexed (primary)) { gfc_error ("Coindexed procedure-pointer component at %C"); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 0d95973..84fc6fa 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,10 @@ 2016-12-03 Janus Weil + PR fortran/43207 + * gfortran.dg/abstract_type_9.f90: New test case. + +2016-12-03 Janus Weil + PR fortran/42188 * gfortran.dg/derived_result_2.f90.f90: New test case. diff --git a/gcc/testsuite/gfortran.dg/abstract_type_9.f90 b/gcc/testsuite/gfortran.dg/abstract_type_9.f90 new file mode 100644 index 0000000..77d48ba --- /dev/null +++ b/gcc/testsuite/gfortran.dg/abstract_type_9.f90 @@ -0,0 +1,34 @@ +! { dg-do compile } +! +! PR 43207: [OOP] invalid (pointer) assignment to and from abstract non-polymorphic expressions +! +! Contributed by Tobias Burnus + + implicit none + type, abstract :: parent + integer :: i + end type + type, extends(parent) :: child + class(parent), pointer :: comp + end type + + type(child), target :: c1 + class(child), allocatable :: c2 + class(parent), pointer :: cp + + c1%parent = c1%parent ! { dg-error "Nonpolymorphic reference to abstract type" } + c2%parent = c1%parent ! { dg-error "Nonpolymorphic reference to abstract type" } + + cp => c1%comp + cp => c1%parent ! { dg-error "Nonpolymorphic reference to abstract type" } + + call sub(c1%comp) + call sub(c1%parent) ! { dg-error "Nonpolymorphic reference to abstract type" } + +contains + + subroutine sub(arg) + class(parent) :: arg + end subroutine + +end -- cgit v1.1 From c818397a989cda38cf335a411360307dea311c99 Mon Sep 17 00:00:00 2001 From: GCC Administrator Date: Sun, 4 Dec 2016 00:16:17 +0000 Subject: Daily bump. From-SVN: r243227 --- gcc/DATESTAMP | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index 8caabe1..f179554 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20161203 +20161204 -- cgit v1.1 From 6b7d84532342ed038a07d850ddacf7a86106a998 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 4 Dec 2016 15:38:05 +0100 Subject: re PR target/70322 (STV doesn't optimize andn) PR target/70322 * config/i386/i386.c (dimode_scalar_to_vector_candidate_p): Handle NEG. (dimode_scalar_chain::compute_convert_gain): Ditto. (dimode_scalar_chain::convert_insn): Ditto. testsuite/ChangeLog: PR target/70322 * gcc.target/i386/pr70322-4.c: New test. From-SVN: r243228 --- gcc/ChangeLog | 9 ++++++++- gcc/config/i386/i386.c | 15 +++++++++++++-- gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/gcc.target/i386/pr70322-4.c | 13 +++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr70322-4.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 496c691..e888c03 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,10 +1,17 @@ +2016-12-04 Uros Bizjak + + PR target/70322 + * config/i386/i386.c (dimode_scalar_to_vector_candidate_p): Handle NEG. + (dimode_scalar_chain::compute_convert_gain): Ditto. + (dimode_scalar_chain::convert_insn): Ditto. + 2016-12-03 Eric Botcazou * lra-constraints.c (emit_spill_move): Use gen_lowpart_SUBREG in all cases to build a lowpart SUBREG. 2016-12-03 Eric Botcazou - David S. Miller + David S. Miller * config/sparc/constraints.md (U): Adjust comment. * config/sparc/sparc.md (lra): New attribute. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 0bee09b..41717da 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2826,6 +2826,7 @@ dimode_scalar_to_vector_candidate_p (rtx_insn *insn) return false; break; + case NEG: case NOT: break; @@ -2851,7 +2852,8 @@ dimode_scalar_to_vector_candidate_p (rtx_insn *insn) if ((GET_MODE (XEXP (src, 0)) != DImode && !CONST_INT_P (XEXP (src, 0))) - || (GET_CODE (src) != NOT + || (GET_CODE (src) != NEG + && GET_CODE (src) != NOT && GET_MODE (XEXP (src, 1)) != DImode && !CONST_INT_P (XEXP (src, 1)))) return false; @@ -3419,7 +3421,8 @@ dimode_scalar_chain::compute_convert_gain () if (CONST_INT_P (XEXP (src, 1))) gain -= vector_const_cost (XEXP (src, 1)); } - else if (GET_CODE (src) == NOT) + else if (GET_CODE (src) == NEG + || GET_CODE (src) == NOT) gain += ix86_cost->add - COSTS_N_INSNS (1); else if (GET_CODE (src) == COMPARE) { @@ -3776,6 +3779,14 @@ dimode_scalar_chain::convert_insn (rtx_insn *insn) PUT_MODE (src, V2DImode); break; + case NEG: + src = XEXP (src, 0); + convert_op (&src, insn); + subreg = gen_reg_rtx (V2DImode); + emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn); + src = gen_rtx_MINUS (V2DImode, subreg, src); + break; + case NOT: src = XEXP (src, 0); convert_op (&src, insn); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 84fc6fa..a323678 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-04 Uros Bizjak + + PR target/70322 + * gcc.target/i386/pr70322-4.c: New test. + 2016-12-03 Janus Weil PR fortran/43207 diff --git a/gcc/testsuite/gcc.target/i386/pr70322-4.c b/gcc/testsuite/gcc.target/i386/pr70322-4.c new file mode 100644 index 0000000..8a02b9b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr70322-4.c @@ -0,0 +1,13 @@ +/* PR target/70322 */ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-O2 -msse2 -mstv" } */ +/* { dg-final { scan-assembler "psub" } } */ +/* { dg-final { scan-assembler "por" } } */ + +extern long long z; + +void +foo (long long x, long long y) +{ + z = -x | y; +} -- cgit v1.1 From 5dcf45618031c9d0e44ba306102a6562eed879d3 Mon Sep 17 00:00:00 2001 From: Martin Sebor Date: Sun, 4 Dec 2016 17:48:44 +0000 Subject: PR c/78668 - aligned_alloc, realloc, et al. missing attribute alloc_size gcc/ChangeLog: PR c/78668 * builtin-attrs.def (ATTR_ALLOC_SIZE, ATTR_RETURNS_NONNULL): New identifier tree nodes. (ATTR_ALLOCA_SIZE_1_NOTHROW_LEAF_LIST): New attribute list. (ATTR_MALLOC_SIZE_1_NOTHROW_LIST): Same. (ATTR_MALLOC_SIZE_1_NOTHROW_LEAF_LIST): Same. (ATTR_MALLOC_SIZE_1_2_NOTHROW_LEAF_LIST): Same. (ATTR_ALLOC_SIZE_2_NOTHROW_LEAF_LIST): Same. * builtins.def (aligned_alloc, calloc, malloc, realloc): Add attribute alloc_size. (alloca): Add attribute alloc_size and returns_nonnull. gcc/testsuite/ChangeLog: PR c/78668 * gcc.dg/builtin-alloc-size.c: New test. From-SVN: r243231 --- gcc/ChangeLog | 14 ++++++ gcc/builtin-attrs.def | 21 +++++++++ gcc/builtins.def | 10 ++--- gcc/testsuite/ChangeLog | 5 +++ gcc/testsuite/gcc.dg/builtin-alloc-size.c | 72 +++++++++++++++++++++++++++++++ 5 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/builtin-alloc-size.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e888c03..c33ad2f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,17 @@ +2016-12-04 Martin Sebor + + PR c/78668 + * builtin-attrs.def (ATTR_ALLOC_SIZE, ATTR_RETURNS_NONNULL): New + identifier tree nodes. + (ATTR_ALLOCA_SIZE_1_NOTHROW_LEAF_LIST): New attribute list. + (ATTR_MALLOC_SIZE_1_NOTHROW_LIST): Same. + (ATTR_MALLOC_SIZE_1_NOTHROW_LEAF_LIST): Same. + (ATTR_MALLOC_SIZE_1_2_NOTHROW_LEAF_LIST): Same. + (ATTR_ALLOC_SIZE_2_NOTHROW_LEAF_LIST): Same. + * builtins.def (aligned_alloc, calloc, malloc, realloc): + Add attribute alloc_size. + (alloca): Add attribute alloc_size and returns_nonnull. + 2016-12-04 Uros Bizjak PR target/70322 diff --git a/gcc/builtin-attrs.def b/gcc/builtin-attrs.def index 88c9bd1..1520d15 100644 --- a/gcc/builtin-attrs.def +++ b/gcc/builtin-attrs.def @@ -83,6 +83,7 @@ DEF_LIST_INT_INT (5,6) #undef DEF_LIST_INT_INT /* Construct trees for identifiers. */ +DEF_ATTR_IDENT (ATTR_ALLOC_SIZE, "alloc_size") DEF_ATTR_IDENT (ATTR_COLD, "cold") DEF_ATTR_IDENT (ATTR_CONST, "const") DEF_ATTR_IDENT (ATTR_FORMAT, "format") @@ -151,6 +152,26 @@ DEF_ATTR_TREE_LIST (ATTR_SENTINEL_NOTHROW_LEAF_LIST, ATTR_SENTINEL, \ DEF_ATTR_TREE_LIST (ATTR_COLD_CONST_NORETURN_NOTHROW_LEAF_LIST, ATTR_CONST,\ ATTR_NULL, ATTR_COLD_NORETURN_NOTHROW_LEAF_LIST) +/* Allocation functions like malloc and realloc whose first argument + specifies the size of the allocated object. */ +DEF_ATTR_TREE_LIST (ATTR_MALLOC_SIZE_1_NOTHROW_LIST, ATTR_ALLOC_SIZE, \ + ATTR_LIST_1, ATTR_MALLOC_NOTHROW_LIST) +DEF_ATTR_TREE_LIST (ATTR_MALLOC_SIZE_1_NOTHROW_LEAF_LIST, ATTR_ALLOC_SIZE, \ + ATTR_LIST_1, ATTR_MALLOC_NOTHROW_LEAF_LIST) +/* Alloca is just like malloc except that it never returns null. */ +DEF_ATTR_TREE_LIST (ATTR_ALLOCA_SIZE_1_NOTHROW_LEAF_LIST, ATTR_RETURNS_NONNULL, + ATTR_NULL, ATTR_MALLOC_SIZE_1_NOTHROW_LEAF_LIST) + +/* Allocation functions like calloc the product of whose first two arguments + specifies the size of the allocated object. */ +DEF_ATTR_TREE_LIST (ATTR_MALLOC_SIZE_1_2_NOTHROW_LEAF_LIST, ATTR_ALLOC_SIZE, \ + ATTR_LIST_1_2, ATTR_MALLOC_NOTHROW_LEAF_LIST) + +/* Allocation functions like realloc whose second argument specifies + the size of the allocated object. */ +DEF_ATTR_TREE_LIST (ATTR_ALLOC_SIZE_2_NOTHROW_LEAF_LIST, ATTR_ALLOC_SIZE, \ + ATTR_LIST_2, ATTR_NOTHROW_LEAF_LIST) + /* Functions whose pointer parameter(s) are all nonnull. */ DEF_ATTR_TREE_LIST (ATTR_NONNULL_LIST, ATTR_NONNULL, ATTR_NULL, ATTR_NULL) /* Functions whose first parameter is a nonnull pointer. */ diff --git a/gcc/builtins.def b/gcc/builtins.def index 6766975..9cd24e8 100644 --- a/gcc/builtins.def +++ b/gcc/builtins.def @@ -297,7 +297,7 @@ DEF_C99_BUILTIN (BUILT_IN_ACOSH, "acosh", BT_FN_DOUBLE_DOUBLE, ATTR_MATHF DEF_C99_BUILTIN (BUILT_IN_ACOSHF, "acoshf", BT_FN_FLOAT_FLOAT, ATTR_MATHFN_FPROUNDING_ERRNO) DEF_C99_BUILTIN (BUILT_IN_ACOSHL, "acoshl", BT_FN_LONGDOUBLE_LONGDOUBLE, ATTR_MATHFN_FPROUNDING_ERRNO) DEF_C99_C90RES_BUILTIN (BUILT_IN_ACOSL, "acosl", BT_FN_LONGDOUBLE_LONGDOUBLE, ATTR_MATHFN_FPROUNDING_ERRNO) -DEF_C11_BUILTIN (BUILT_IN_ALIGNED_ALLOC, "aligned_alloc", BT_FN_PTR_SIZE_SIZE, ATTR_MALLOC_NOTHROW_LIST) +DEF_C11_BUILTIN (BUILT_IN_ALIGNED_ALLOC, "aligned_alloc", BT_FN_PTR_SIZE_SIZE, ATTR_MALLOC_SIZE_1_NOTHROW_LIST) DEF_LIB_BUILTIN (BUILT_IN_ASIN, "asin", BT_FN_DOUBLE_DOUBLE, ATTR_MATHFN_FPROUNDING_ERRNO) DEF_C99_C90RES_BUILTIN (BUILT_IN_ASINF, "asinf", BT_FN_FLOAT_FLOAT, ATTR_MATHFN_FPROUNDING_ERRNO) DEF_C99_BUILTIN (BUILT_IN_ASINH, "asinh", BT_FN_DOUBLE_DOUBLE, ATTR_MATHFN_FPROUNDING) @@ -777,7 +777,7 @@ DEF_GCC_BUILTIN (BUILT_IN_UMULLL_OVERFLOW, "umulll_overflow", BT_FN_BOOL_ DEF_LIB_BUILTIN (BUILT_IN_ABORT, "abort", BT_FN_VOID, ATTR_TMPURE_NORETURN_NOTHROW_LEAF_LIST) DEF_LIB_BUILTIN (BUILT_IN_ABS, "abs", BT_FN_INT_INT, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_AGGREGATE_INCOMING_ADDRESS, "aggregate_incoming_address", BT_FN_PTR_VAR, ATTR_LEAF_LIST) -DEF_EXT_LIB_BUILTIN (BUILT_IN_ALLOCA, "alloca", BT_FN_PTR_SIZE, ATTR_MALLOC_NOTHROW_LEAF_LIST) +DEF_EXT_LIB_BUILTIN (BUILT_IN_ALLOCA, "alloca", BT_FN_PTR_SIZE, ATTR_ALLOCA_SIZE_1_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_APPLY, "apply", BT_FN_PTR_PTR_FN_VOID_VAR_PTR_SIZE, ATTR_NULL) DEF_GCC_BUILTIN (BUILT_IN_APPLY_ARGS, "apply_args", BT_FN_PTR_VAR, ATTR_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_BSWAP16, "bswap16", BT_FN_UINT16_UINT16, ATTR_CONST_NOTHROW_LEAF_LIST) @@ -785,7 +785,7 @@ DEF_GCC_BUILTIN (BUILT_IN_BSWAP32, "bswap32", BT_FN_UINT32_UINT32, ATTR_C DEF_GCC_BUILTIN (BUILT_IN_BSWAP64, "bswap64", BT_FN_UINT64_UINT64, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_EXT_LIB_BUILTIN (BUILT_IN_CLEAR_CACHE, "__clear_cache", BT_FN_VOID_PTR_PTR, ATTR_NOTHROW_LEAF_LIST) /* [trans-mem]: Adjust BUILT_IN_TM_CALLOC if BUILT_IN_CALLOC is changed. */ -DEF_LIB_BUILTIN (BUILT_IN_CALLOC, "calloc", BT_FN_PTR_SIZE_SIZE, ATTR_MALLOC_NOTHROW_LEAF_LIST) +DEF_LIB_BUILTIN (BUILT_IN_CALLOC, "calloc", BT_FN_PTR_SIZE_SIZE, ATTR_MALLOC_SIZE_1_2_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_CLASSIFY_TYPE, "classify_type", BT_FN_INT_VAR, ATTR_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_CLZ, "clz", BT_FN_INT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_CLZIMAX, "clzimax", BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST) @@ -861,7 +861,7 @@ DEF_LIB_BUILTIN (BUILT_IN_LABS, "labs", BT_FN_LONG_LONG, ATTR_CONST_NOTHR DEF_C99_BUILTIN (BUILT_IN_LLABS, "llabs", BT_FN_LONGLONG_LONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_LONGJMP, "longjmp", BT_FN_VOID_PTR_INT, ATTR_NORETURN_NOTHROW_LIST) /* [trans-mem]: Adjust BUILT_IN_TM_MALLOC if BUILT_IN_MALLOC is changed. */ -DEF_LIB_BUILTIN (BUILT_IN_MALLOC, "malloc", BT_FN_PTR_SIZE, ATTR_MALLOC_NOTHROW_LEAF_LIST) +DEF_LIB_BUILTIN (BUILT_IN_MALLOC, "malloc", BT_FN_PTR_SIZE, ATTR_MALLOC_SIZE_1_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_NEXT_ARG, "next_arg", BT_FN_PTR_VAR, ATTR_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_PARITY, "parity", BT_FN_INT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_PARITYIMAX, "parityimax", BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST) @@ -873,7 +873,7 @@ DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTL, "popcountl", BT_FN_INT_ULONG, ATTR_C DEF_GCC_BUILTIN (BUILT_IN_POPCOUNTLL, "popcountll", BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST) DEF_EXT_LIB_BUILTIN (BUILT_IN_POSIX_MEMALIGN, "posix_memalign", BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF) DEF_GCC_BUILTIN (BUILT_IN_PREFETCH, "prefetch", BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST) -DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_NOTHROW_LEAF_LIST) +DEF_LIB_BUILTIN (BUILT_IN_REALLOC, "realloc", BT_FN_PTR_PTR_SIZE, ATTR_ALLOC_SIZE_2_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_RETURN, "return", BT_FN_VOID_PTR, ATTR_NORETURN_NOTHROW_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_RETURN_ADDRESS, "return_address", BT_FN_PTR_UINT, ATTR_LEAF_LIST) DEF_GCC_BUILTIN (BUILT_IN_SAVEREGS, "saveregs", BT_FN_PTR_VAR, ATTR_NULL) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index a323678..838fca5 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-04 Martin Sebor + + PR c/78668 + * gcc.dg/builtin-alloc-size.c: New test. + 2016-12-04 Uros Bizjak PR target/70322 diff --git a/gcc/testsuite/gcc.dg/builtin-alloc-size.c b/gcc/testsuite/gcc.dg/builtin-alloc-size.c new file mode 100644 index 0000000..5a40862 --- /dev/null +++ b/gcc/testsuite/gcc.dg/builtin-alloc-size.c @@ -0,0 +1,72 @@ +/* PR c/78668 - aligned_alloc, realloc, et al. missing attribute alloc_size + Test to verify that memory allocation built-ins are decorated with + attribute alloc_size that __builtin_object_size can make use of (or + are treated as if they were for that purpose).. + { dg-do compile } + { dg-additional-options "-O2 -fdump-tree-optimized" } */ + +void sink (void*); + +unsigned size (unsigned n) +{ + return n; +} + +void test_aligned_alloc (unsigned a) +{ + unsigned n = size (7); + + void *p = __builtin_aligned_alloc (n, a); + if (__builtin_object_size (p, 0) != n) + __builtin_abort (); + sink (p); +} + +void test_alloca (void) +{ + unsigned n = size (13); + + void *p = __builtin_alloca (n); + + /* Also verify that alloca is declared with attribute returns_nonnull + (or treated as it were as the case may be). */ + if (!p) + __builtin_abort (); + + if (__builtin_object_size (p, 0) != n) + __builtin_abort (); + sink (p); +} + +void test_calloc (void) +{ + unsigned m = size (19); + unsigned n = size (23); + + void *p = __builtin_calloc (m, n); + if (__builtin_object_size (p, 0) != m * n) + __builtin_abort (); + sink (p); +} + +void test_malloc (void) +{ + unsigned n = size (17); + + void *p = __builtin_malloc (n); + if (__builtin_object_size (p, 0) != n) + __builtin_abort (); + sink (p); +} + +void test_realloc (void *p) +{ + unsigned n = size (31); + + p = __builtin_realloc (p, n); + if (__builtin_object_size (p, 0) != n) + __builtin_abort (); + sink (p); +} + +/* { dg-final { scan-tree-dump-not "abort" "optimized" } } */ -- cgit v1.1 From aa7cfe40579670e5bc0df3220181aad0bee763ab Mon Sep 17 00:00:00 2001 From: Janus Weil Date: Sun, 4 Dec 2016 21:31:26 +0100 Subject: re PR fortran/78618 (ICE in gfc_check_rank, at fortran/check.c:3670) 2016-12-04 Janus Weil PR fortran/78618 * intrinsic.c (gfc_convert_type_warn): Do not set the full typespec for the conversion symbol, but only type and kind. Set the full typespec for the expression. (gfc_convert_chartype): Ditto. From-SVN: r243232 --- gcc/fortran/ChangeLog | 8 ++++++++ gcc/fortran/intrinsic.c | 8 ++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index 3489bc4..2c06b31 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,11 @@ +2016-12-04 Janus Weil + + PR fortran/78618 + * intrinsic.c (gfc_convert_type_warn): Do not set the full typespec for + the conversion symbol, but only type and kind. Set the full typespec + for the expression. + (gfc_convert_chartype): Ditto. + 2016-12-03 Janus Weil PR fortran/43207 diff --git a/gcc/fortran/intrinsic.c b/gcc/fortran/intrinsic.c index fdc11d8..fb83402 100644 --- a/gcc/fortran/intrinsic.c +++ b/gcc/fortran/intrinsic.c @@ -4984,12 +4984,14 @@ gfc_convert_type_warn (gfc_expr *expr, gfc_typespec *ts, int eflag, int wflag) new_expr->value.function.name = sym->lib_name; new_expr->value.function.isym = sym; new_expr->where = old_where; + new_expr->ts = *ts; new_expr->rank = rank; new_expr->shape = gfc_copy_shape (shape, rank); gfc_get_ha_sym_tree (sym->name, &new_expr->symtree); new_expr->symtree->n.sym->result = new_expr->symtree->n.sym; - new_expr->symtree->n.sym->ts = *ts; + new_expr->symtree->n.sym->ts.type = ts->type; + new_expr->symtree->n.sym->ts.kind = ts->kind; new_expr->symtree->n.sym->attr.flavor = FL_PROCEDURE; new_expr->symtree->n.sym->attr.function = 1; new_expr->symtree->n.sym->attr.elemental = 1; @@ -5055,11 +5057,13 @@ gfc_convert_chartype (gfc_expr *expr, gfc_typespec *ts) new_expr->value.function.name = sym->lib_name; new_expr->value.function.isym = sym; new_expr->where = old_where; + new_expr->ts = *ts; new_expr->rank = rank; new_expr->shape = gfc_copy_shape (shape, rank); gfc_get_ha_sym_tree (sym->name, &new_expr->symtree); - new_expr->symtree->n.sym->ts = *ts; + new_expr->symtree->n.sym->ts.type = ts->type; + new_expr->symtree->n.sym->ts.kind = ts->kind; new_expr->symtree->n.sym->attr.flavor = FL_PROCEDURE; new_expr->symtree->n.sym->attr.function = 1; new_expr->symtree->n.sym->attr.elemental = 1; -- cgit v1.1 From b2a8d083c5f5a0484a1a228e0db7fa7b8cf59473 Mon Sep 17 00:00:00 2001 From: GCC Administrator Date: Mon, 5 Dec 2016 00:16:15 +0000 Subject: Daily bump. From-SVN: r243235 --- gcc/DATESTAMP | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index f179554..1c033b9 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20161204 +20161205 -- cgit v1.1 From dc7342d2911eb77f0d6a0e2e65782b7f47acb5ad Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Mon, 5 Dec 2016 09:30:57 +0000 Subject: sparc-protos.h (sparc_splitdi_legitimate): Rename to... * config/sparc/sparc-protos.h (sparc_splitdi_legitimate): Rename to... (sparc_split_reg_mem_legitimate): ...this. (sparc_split_reg_mem): Declare. (sparc_split_mem_reg): Likewise. (sparc_split_regreg_legitimate): Rename to... (sparc_split_reg_reg_legitimate): ...this. * config/sparc/sparc.c (sparc_splitdi_legitimate): Rename to... (sparc_split_reg_mem_legitimate): ...this. (sparc_split_reg_mem): New function. (sparc_split_mem_reg): Likewise. (sparc_split_regreg_legitimate): Rename to... (sparc_split_reg_reg_legitimate): ...this. (sparc_split_reg_reg): New function. * config/sparc/sparc.md (lra): Remove "none" value. (enabled): Adjust to above change. (*movdi_insn_sp32): Remove new (r,T) alternative and reorder others. (DImode splitters): Adjust to above renamings and use new functions. (*movdf_insn_sp32): Remove new (r,T) alternative and reorder others. (DFmode splitters): Adjust to above renamings and use new functions. (*mov_insn_sp64): Replace C with Z constraint and use W constraint in conjunction with e. (*mov_insn_sp32): Remove new (r,T) alternative, add (o,Y) alternative and reorder others. (VM64:mode splitters): Adjust to above renamings and use new functions. From-SVN: r243238 --- gcc/ChangeLog | 27 +++ gcc/config/sparc/sparc-protos.h | 7 +- gcc/config/sparc/sparc.c | 80 +++++++-- gcc/config/sparc/sparc.md | 383 +++++++++++++++------------------------- 4 files changed, 242 insertions(+), 255 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c33ad2f..730044c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,30 @@ +2016-12-05 Eric Botcazou + + * config/sparc/sparc-protos.h (sparc_splitdi_legitimate): Rename to... + (sparc_split_reg_mem_legitimate): ...this. + (sparc_split_reg_mem): Declare. + (sparc_split_mem_reg): Likewise. + (sparc_split_regreg_legitimate): Rename to... + (sparc_split_reg_reg_legitimate): ...this. + * config/sparc/sparc.c (sparc_splitdi_legitimate): Rename to... + (sparc_split_reg_mem_legitimate): ...this. + (sparc_split_reg_mem): New function. + (sparc_split_mem_reg): Likewise. + (sparc_split_regreg_legitimate): Rename to... + (sparc_split_reg_reg_legitimate): ...this. + (sparc_split_reg_reg): New function. + * config/sparc/sparc.md (lra): Remove "none" value. + (enabled): Adjust to above change. + (*movdi_insn_sp32): Remove new (r,T) alternative and reorder others. + (DImode splitters): Adjust to above renamings and use new functions. + (*movdf_insn_sp32): Remove new (r,T) alternative and reorder others. + (DFmode splitters): Adjust to above renamings and use new functions. + (*mov_insn_sp64): Replace C with Z constraint and use W + constraint in conjunction with e. + (*mov_insn_sp32): Remove new (r,T) alternative, add (o,Y) + alternative and reorder others. + (VM64:mode splitters): Adjust to above renamings and use new functions. + 2016-12-04 Martin Sebor PR c/78668 diff --git a/gcc/config/sparc/sparc-protos.h b/gcc/config/sparc/sparc-protos.h index 4e23a44..931e693 100644 --- a/gcc/config/sparc/sparc-protos.h +++ b/gcc/config/sparc/sparc-protos.h @@ -68,8 +68,11 @@ extern void sparc_emit_call_insn (rtx, rtx); extern void sparc_defer_case_vector (rtx, rtx, int); extern bool sparc_expand_move (machine_mode, rtx *); extern void sparc_emit_set_symbolic_const64 (rtx, rtx, rtx); -extern int sparc_splitdi_legitimate (rtx, rtx); -extern int sparc_split_regreg_legitimate (rtx, rtx); +extern int sparc_split_reg_mem_legitimate (rtx, rtx); +extern void sparc_split_reg_mem (rtx, rtx, machine_mode); +extern void sparc_split_mem_reg (rtx, rtx, machine_mode); +extern int sparc_split_reg_reg_legitimate (rtx, rtx); +extern void sparc_split_reg_reg (rtx, rtx, machine_mode); extern const char *output_ubranch (rtx, rtx_insn *); extern const char *output_cbranch (rtx, rtx, int, int, int, rtx_insn *); extern const char *output_return (rtx_insn *); diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c index e17552a..a5537a1 100644 --- a/gcc/config/sparc/sparc.c +++ b/gcc/config/sparc/sparc.c @@ -8484,46 +8484,82 @@ order_regs_for_local_alloc (void) } /* Return 1 if REG and MEM are legitimate enough to allow the various - mem<-->reg splits to be run. */ + MEM<-->REG splits to be run. */ int -sparc_splitdi_legitimate (rtx reg, rtx mem) +sparc_split_reg_mem_legitimate (rtx reg, rtx mem) { /* Punt if we are here by mistake. */ gcc_assert (reload_completed); /* We must have an offsettable memory reference. */ - if (! offsettable_memref_p (mem)) + if (!offsettable_memref_p (mem)) return 0; /* If we have legitimate args for ldd/std, we do not want the split to happen. */ - if ((REGNO (reg) % 2) == 0 - && mem_min_alignment (mem, 8)) + if ((REGNO (reg) % 2) == 0 && mem_min_alignment (mem, 8)) return 0; /* Success. */ return 1; } -/* Like sparc_splitdi_legitimate but for REG <--> REG moves. */ +/* Split a REG <-- MEM move into a pair of moves in MODE. */ + +void +sparc_split_reg_mem (rtx dest, rtx src, machine_mode mode) +{ + rtx high_part = gen_highpart (mode, dest); + rtx low_part = gen_lowpart (mode, dest); + rtx word0 = adjust_address (src, mode, 0); + rtx word1 = adjust_address (src, mode, 4); + + if (reg_overlap_mentioned_p (high_part, word1)) + { + emit_move_insn_1 (low_part, word1); + emit_move_insn_1 (high_part, word0); + } + else + { + emit_move_insn_1 (high_part, word0); + emit_move_insn_1 (low_part, word1); + } +} + +/* Split a MEM <-- REG move into a pair of moves in MODE. */ + +void +sparc_split_mem_reg (rtx dest, rtx src, machine_mode mode) +{ + rtx word0 = adjust_address (dest, mode, 0); + rtx word1 = adjust_address (dest, mode, 4); + rtx high_part = gen_highpart (mode, src); + rtx low_part = gen_lowpart (mode, src); + + emit_move_insn_1 (word0, high_part); + emit_move_insn_1 (word1, low_part); +} + +/* Like sparc_split_reg_mem_legitimate but for REG <--> REG moves. */ int -sparc_split_regreg_legitimate (rtx reg1, rtx reg2) +sparc_split_reg_reg_legitimate (rtx reg1, rtx reg2) { - int regno1, regno2; + /* Punt if we are here by mistake. */ + gcc_assert (reload_completed); if (GET_CODE (reg1) == SUBREG) reg1 = SUBREG_REG (reg1); if (GET_CODE (reg1) != REG) return 0; - regno1 = REGNO (reg1); + const int regno1 = REGNO (reg1); if (GET_CODE (reg2) == SUBREG) reg2 = SUBREG_REG (reg2); if (GET_CODE (reg2) != REG) return 0; - regno2 = REGNO (reg2); + const int regno2 = REGNO (reg2); if (SPARC_INT_REG_P (regno1) && SPARC_INT_REG_P (regno2)) return 1; @@ -8538,6 +8574,30 @@ sparc_split_regreg_legitimate (rtx reg1, rtx reg2) return 0; } +/* Split a REG <--> REG move into a pair of moves in MODE. */ + +void +sparc_split_reg_reg (rtx dest, rtx src, machine_mode mode) +{ + rtx dest1 = gen_highpart (mode, dest); + rtx dest2 = gen_lowpart (mode, dest); + rtx src1 = gen_highpart (mode, src); + rtx src2 = gen_lowpart (mode, src); + + /* Now emit using the real source and destination we found, swapping + the order if we detect overlap. */ + if (reg_overlap_mentioned_p (dest1, src2)) + { + emit_move_insn_1 (dest2, src2); + emit_move_insn_1 (dest1, src1); + } + else + { + emit_move_insn_1 (dest1, src1); + emit_move_insn_1 (dest2, src2); + } +} + /* Return 1 if REGNO (reg1) is even and REGNO (reg1) == REGNO (reg2) - 1. This makes them candidates for using ldd and std insns. diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md index 000c5a3..767d508 100644 --- a/gcc/config/sparc/sparc.md +++ b/gcc/config/sparc/sparc.md @@ -254,14 +254,12 @@ (define_attr "cpu_feature" "none,fpu,fpunotv9,v9,vis,vis3,vis4" (const_string "none")) -(define_attr "lra" "none,disabled,enabled" - (const_string "none")) +(define_attr "lra" "disabled,enabled" + (const_string "enabled")) (define_attr "enabled" "" (cond [(eq_attr "cpu_feature" "none") - (cond [(eq_attr "lra" "disabled") (symbol_ref "!TARGET_LRA") - (eq_attr "lra" "enabled") (symbol_ref "TARGET_LRA")] - (const_int 1)) + (cond [(eq_attr "lra" "disabled") (symbol_ref "!TARGET_LRA")] (const_int 1)) (eq_attr "cpu_feature" "fpu") (symbol_ref "TARGET_FPU") (eq_attr "cpu_feature" "fpunotv9") (symbol_ref "TARGET_FPU && !TARGET_V9") (eq_attr "cpu_feature" "v9") (symbol_ref "TARGET_V9") @@ -1707,25 +1705,23 @@ (define_insn "*movdi_insn_sp32" [(set (match_operand:DI 0 "nonimmediate_operand" - "=T,o,T,T,U,r,o,r,r,r,?T,?*f,?*f,?o,?*e,?*e, r,?*f,?*e,?W,b,b") + "=T,o,U,T,r,o,r,r,?*f,?T,?*f,?o,?*e,?*e, r,?*f,?*e,?W,*b,*b") (match_operand:DI 1 "input_operand" - " J,J,U,r,T,T,r,o,i,r,*f, T, o,*f, *e, *e,?*f, r, W,*e,J,P"))] + " J,J,T,U,o,r,i,r, T,*f, o,*f, *e, *e,?*f, r, W,*e, J, P"))] "TARGET_ARCH32 && (register_operand (operands[0], DImode) || register_or_zero_operand (operands[1], DImode))" "@ - stx\t%%g0, %0 + stx\t%r1, %0 # - std\t%1, %0 - std\t%1, %0 ldd\t%1, %0 + std\t%1, %0 ldd\t%1, %0 + std\t%1, %0 # # - # - # - std\t%1, %0 ldd\t%1, %0 + std\t%1, %0 # # fmovd\t%1, %0 @@ -1736,12 +1732,12 @@ std\t%1, %0 fzero\t%0 fone\t%0" - [(set_attr "type" "store,store,store,store,load,load,*,*,*,*,fpstore,fpload,*,*,fpmove,*,*,*,fpload,fpstore,visl,visl") - (set_attr "length" "*,2,*,*,*,*,2,2,2,2,*,*,2,2,*,2,2,2,*,*,*,*") - (set_attr "fptype" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,double,*,*,*,*,*,double,double") - (set_attr "cpu_feature" "v9,*,*,*,*,*,*,*,*,*,fpu,fpu,fpu,fpu,v9,fpunotv9,vis3,vis3,fpu,fpu,vis,vis") - (set_attr "v3pipe" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,true,true") - (set_attr "lra" "*,*,disabled,enabled,disabled,enabled,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")]) + [(set_attr "type" "store,*,load,store,load,store,*,*,fpload,fpstore,*,*,fpmove,*,*,*,fpload,fpstore,visl,visl") + (set_attr "length" "*,2,*,*,*,*,2,2,*,*,2,2,*,2,2,2,*,*,*,*") + (set_attr "fptype" "*,*,*,*,*,*,*,*,*,*,*,*,double,*,*,*,*,*,double,double") + (set_attr "cpu_feature" "v9,*,*,*,*,*,*,*,fpu,fpu,fpu,fpu,v9,fpunotv9,vis3,vis3,fpu,fpu,vis,vis") + (set_attr "v3pipe" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,true,true") + (set_attr "lra" "*,*,disabled,disabled,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")]) (define_insn "*movdi_insn_sp64" [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,r, m, r,*e,?*e,?*e,?W,b,b") @@ -1997,30 +1993,27 @@ (define_split [(set (match_operand:DI 0 "register_operand" "") (match_operand:DI 1 "const_int_operand" ""))] - "TARGET_ARCH32 + "reload_completed + && TARGET_ARCH32 && ((GET_CODE (operands[0]) == REG && SPARC_INT_REG_P (REGNO (operands[0]))) || (GET_CODE (operands[0]) == SUBREG && GET_CODE (SUBREG_REG (operands[0])) == REG - && SPARC_INT_REG_P (REGNO (SUBREG_REG (operands[0]))))) - && reload_completed" + && SPARC_INT_REG_P (REGNO (SUBREG_REG (operands[0])))))" [(clobber (const_int 0))] { - HOST_WIDE_INT low, high; + HOST_WIDE_INT low = trunc_int_for_mode (INTVAL (operands[1]), SImode); + HOST_WIDE_INT high = trunc_int_for_mode (INTVAL (operands[1]) >> 32, SImode); + rtx high_part = gen_highpart (SImode, operands[0]); + rtx low_part = gen_lowpart (SImode, operands[0]); - low = trunc_int_for_mode (INTVAL (operands[1]), SImode); - high = trunc_int_for_mode (INTVAL (operands[1]) >> 32, SImode); - emit_insn (gen_movsi (gen_highpart (SImode, operands[0]), GEN_INT (high))); + emit_move_insn_1 (high_part, GEN_INT (high)); - /* Slick... but this trick loses if this subreg constant part - can be done in one insn. */ - if (low == high - && !SPARC_SETHI32_P (high) - && !SPARC_SIMM13_P (high)) - emit_insn (gen_movsi (gen_lowpart (SImode, operands[0]), - gen_highpart (SImode, operands[0]))); + /* Slick... but this loses if the constant can be done in one insn. */ + if (low == high && !SPARC_SETHI32_P (high) && !SPARC_SIMM13_P (high)) + emit_move_insn_1 (low_part, high_part); else - emit_insn (gen_movsi (gen_lowpart (SImode, operands[0]), GEN_INT (low))); + emit_move_insn_1 (low_part, GEN_INT (low)); DONE; }) @@ -2031,31 +2024,10 @@ "reload_completed && (!TARGET_V9 || (TARGET_ARCH32 - && sparc_split_regreg_legitimate (operands[0], operands[1])))" + && sparc_split_reg_reg_legitimate (operands[0], operands[1])))" [(clobber (const_int 0))] { - rtx set_dest = operands[0]; - rtx set_src = operands[1]; - rtx dest1, dest2; - rtx src1, src2; - - dest1 = gen_highpart (SImode, set_dest); - dest2 = gen_lowpart (SImode, set_dest); - src1 = gen_highpart (SImode, set_src); - src2 = gen_lowpart (SImode, set_src); - - /* Now emit using the real source and destination we found, swapping - the order if we detect overlap. */ - if (reg_overlap_mentioned_p (dest1, src2)) - { - emit_insn (gen_movsi (dest2, src2)); - emit_insn (gen_movsi (dest1, src1)); - } - else - { - emit_insn (gen_movsi (dest1, src1)); - emit_insn (gen_movsi (dest2, src2)); - } + sparc_split_reg_reg (operands[0], operands[1], SImode); DONE; }) @@ -2064,41 +2036,24 @@ (define_split [(set (match_operand:DI 0 "register_operand" "") (match_operand:DI 1 "memory_operand" ""))] - "(TARGET_ARCH32 - && reload_completed - && sparc_splitdi_legitimate (operands[0], operands[1]))" + "reload_completed + && TARGET_ARCH32 + && sparc_split_reg_mem_legitimate (operands[0], operands[1])" [(clobber (const_int 0))] { - rtx word0 = adjust_address (operands[1], SImode, 0); - rtx word1 = adjust_address (operands[1], SImode, 4); - rtx high_part = gen_highpart (SImode, operands[0]); - rtx low_part = gen_lowpart (SImode, operands[0]); - - if (reg_overlap_mentioned_p (high_part, word1)) - { - emit_insn (gen_movsi (low_part, word1)); - emit_insn (gen_movsi (high_part, word0)); - } - else - { - emit_insn (gen_movsi (high_part, word0)); - emit_insn (gen_movsi (low_part, word1)); - } + sparc_split_reg_mem (operands[0], operands[1], SImode); DONE; }) (define_split [(set (match_operand:DI 0 "memory_operand" "") (match_operand:DI 1 "register_operand" ""))] - "(TARGET_ARCH32 - && reload_completed - && sparc_splitdi_legitimate (operands[1], operands[0]))" + "reload_completed + && TARGET_ARCH32 + && sparc_split_reg_mem_legitimate (operands[1], operands[0])" [(clobber (const_int 0))] { - emit_insn (gen_movsi (adjust_address (operands[0], SImode, 0), - gen_highpart (SImode, operands[1]))); - emit_insn (gen_movsi (adjust_address (operands[0], SImode, 4), - gen_lowpart (SImode, operands[1]))); + sparc_split_mem_reg (operands[0], operands[1], SImode); DONE; }) @@ -2112,8 +2067,8 @@ && offsettable_memref_p (operands[0])" [(clobber (const_int 0))] { - emit_insn (gen_movsi (adjust_address (operands[0], SImode, 0), const0_rtx)); - emit_insn (gen_movsi (adjust_address (operands[0], SImode, 4), const0_rtx)); + emit_move_insn_1 (adjust_address (operands[0], SImode, 0), const0_rtx); + emit_move_insn_1 (adjust_address (operands[0], SImode, 4), const0_rtx); DONE; }) @@ -2381,13 +2336,15 @@ (define_insn "*movdf_insn_sp32" [(set (match_operand:DF 0 "nonimmediate_operand" - "=b,b,e,e,*r, f, e,T,W,U,r,T,T, f, *r, o,o") + "=T,o,b,b,e,e,*r, f, e,W,U,T, f,o, *r,*r, o") (match_operand:DF 1 "input_operand" - " G,C,e,e, f,*r,W#F,G,e,T,T,U,r,o#F,*roF,*rG,f"))] + " G,G,G,C,e,e, f,*r,W#F,e,T,U,o#F,f,*rF, o,*r"))] "TARGET_ARCH32 && (register_operand (operands[0], DFmode) || register_or_zero_or_all_ones_operand (operands[1], DFmode))" "@ + stx\t%r1, %0 + # fzero\t%0 fone\t%0 fmovd\t%1, %0 @@ -2395,22 +2352,20 @@ # # ldd\t%1, %0 - stx\t%r1, %0 std\t%1, %0 ldd\t%1, %0 - ldd\t%1, %0 - std\t%1, %0 std\t%1, %0 # # # - #" - [(set_attr "type" "visl,visl,fpmove,*,*,*,fpload,store,fpstore,load,load,store,store,*,*,*,*") - (set_attr "length" "*,*,*,2,2,2,*,*,*,*,*,*,*,2,2,2,2") - (set_attr "fptype" "double,double,double,*,*,*,*,*,*,*,*,*,*,*,*,*,*") - (set_attr "cpu_feature" "vis,vis,v9,fpunotv9,vis3,vis3,fpu,v9,fpu,*,*,*,*,fpu,*,*,fpu") - (set_attr "v3pipe" "true,true,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*") - (set_attr "lra" "*,*,*,*,*,*,*,*,*,disabled,enabled,disabled,enabled,*,*,*,*")]) + ldd\t%1, %0 + std\t%1, %0" + [(set_attr "type" "store,*,visl,visl,fpmove,*,*,*,fpload,fpstore,load,store,*,*,*,load,store") + (set_attr "length" "*,2,*,*,*,2,2,2,*,*,*,*,2,2,2,*,*") + (set_attr "fptype" "*,*,double,double,double,*,*,*,*,*,*,*,*,*,*,*,*") + (set_attr "cpu_feature" "v9,*,vis,vis,v9,fpunotv9,vis3,vis3,fpu,fpu,*,*,fpu,fpu,*,*,*") + (set_attr "v3pipe" "*,*,true,true,*,*,*,*,*,*,*,*,*,*,*,*,*") + (set_attr "lra" "*,*,*,*,*,*,*,*,*,*,disabled,disabled,*,*,*,*,*")]) (define_insn "*movdf_insn_sp64" [(set (match_operand:DF 0 "nonimmediate_operand" "=b,b,e,*r, e, e,W, *r,*r, m,*r") @@ -2440,44 +2395,38 @@ (define_split [(set (match_operand:DF 0 "register_operand" "") (match_operand:DF 1 "const_double_operand" ""))] - "REG_P (operands[0]) + "reload_completed + && REG_P (operands[0]) && SPARC_INT_REG_P (REGNO (operands[0])) - && !const_zero_operand (operands[1], GET_MODE (operands[0])) - && reload_completed" + && !const_zero_operand (operands[1], GET_MODE (operands[0]))" [(clobber (const_int 0))] { operands[0] = gen_raw_REG (DImode, REGNO (operands[0])); if (TARGET_ARCH64) { - machine_mode mode = GET_MODE (operands[1]); - rtx tem = simplify_subreg (DImode, operands[1], mode, 0); + rtx tem = simplify_subreg (DImode, operands[1], DFmode, 0); emit_insn (gen_movdi (operands[0], tem)); } else { - machine_mode mode = GET_MODE (operands[1]); - rtx hi = simplify_subreg (SImode, operands[1], mode, 0); - rtx lo = simplify_subreg (SImode, operands[1], mode, 4); + rtx hi = simplify_subreg (SImode, operands[1], DFmode, 0); + rtx lo = simplify_subreg (SImode, operands[1], DFmode, 4); + rtx high_part = gen_highpart (SImode, operands[0]); + rtx low_part = gen_lowpart (SImode, operands[0]); gcc_assert (GET_CODE (hi) == CONST_INT); gcc_assert (GET_CODE (lo) == CONST_INT); - emit_insn (gen_movsi (gen_highpart (SImode, operands[0]), hi)); + emit_move_insn_1 (high_part, hi); - /* Slick... but this trick loses if this subreg constant part - can be done in one insn. */ + /* Slick... but this loses if the constant can be done in one insn. */ if (lo == hi && !SPARC_SETHI32_P (INTVAL (hi)) && !SPARC_SIMM13_P (INTVAL (hi))) - { - emit_insn (gen_movsi (gen_lowpart (SImode, operands[0]), - gen_highpart (SImode, operands[0]))); - } + emit_move_insn_1 (low_part, high_part); else - { - emit_insn (gen_movsi (gen_lowpart (SImode, operands[0]), lo)); - } + emit_move_insn_1 (low_part, lo); } DONE; }) @@ -2489,35 +2438,31 @@ ;; register DFmode cases must be handled. (define_split [(set (match_operand:DF 0 "register_operand" "") - (match_operand:DF 1 "register_operand" ""))] - "(!TARGET_V9 - || (TARGET_ARCH32 - && sparc_split_regreg_legitimate (operands[0], operands[1]))) - && reload_completed" + (match_operand:DF 1 "const_zero_operand" ""))] + "reload_completed + && TARGET_ARCH32 + && ((GET_CODE (operands[0]) == REG + && SPARC_INT_REG_P (REGNO (operands[0]))) + || (GET_CODE (operands[0]) == SUBREG + && GET_CODE (SUBREG_REG (operands[0])) == REG + && SPARC_INT_REG_P (REGNO (SUBREG_REG (operands[0])))))" [(clobber (const_int 0))] { - rtx set_dest = operands[0]; - rtx set_src = operands[1]; - rtx dest1, dest2; - rtx src1, src2; - - dest1 = gen_highpart (SFmode, set_dest); - dest2 = gen_lowpart (SFmode, set_dest); - src1 = gen_highpart (SFmode, set_src); - src2 = gen_lowpart (SFmode, set_src); + emit_move_insn_1 (gen_highpart (SFmode, operands[0]), CONST0_RTX (SFmode)); + emit_move_insn_1 (gen_lowpart (SFmode, operands[0]), CONST0_RTX (SFmode)); + DONE; +}) - /* Now emit using the real source and destination we found, swapping - the order if we detect overlap. */ - if (reg_overlap_mentioned_p (dest1, src2)) - { - emit_move_insn_1 (dest2, src2); - emit_move_insn_1 (dest1, src1); - } - else - { - emit_move_insn_1 (dest1, src1); - emit_move_insn_1 (dest2, src2); - } +(define_split + [(set (match_operand:DF 0 "register_operand" "") + (match_operand:DF 1 "register_operand" ""))] + "reload_completed + && (!TARGET_V9 + || (TARGET_ARCH32 + && sparc_split_reg_reg_legitimate (operands[0], operands[1])))" + [(clobber (const_int 0))] +{ + sparc_split_reg_reg (operands[0], operands[1], SFmode); DONE; }) @@ -2526,26 +2471,10 @@ (match_operand:DF 1 "memory_operand" ""))] "reload_completed && TARGET_ARCH32 - && (((REGNO (operands[0]) % 2) != 0) - || !mem_min_alignment (operands[1], 8)) - && offsettable_memref_p (operands[1])" + && sparc_split_reg_mem_legitimate (operands[0], operands[1])" [(clobber (const_int 0))] { - rtx word0, word1; - - word0 = adjust_address (operands[1], SFmode, 0); - word1 = adjust_address (operands[1], SFmode, 4); - - if (reg_overlap_mentioned_p (gen_highpart (SFmode, operands[0]), word1)) - { - emit_move_insn_1 (gen_lowpart (SFmode, operands[0]), word1); - emit_move_insn_1 (gen_highpart (SFmode, operands[0]), word0); - } - else - { - emit_move_insn_1 (gen_highpart (SFmode, operands[0]), word0); - emit_move_insn_1 (gen_lowpart (SFmode, operands[0]), word1); - } + sparc_split_reg_mem (operands[0], operands[1], SFmode); DONE; }) @@ -2554,18 +2483,10 @@ (match_operand:DF 1 "register_operand" ""))] "reload_completed && TARGET_ARCH32 - && (((REGNO (operands[1]) % 2) != 0) - || !mem_min_alignment (operands[0], 8)) - && offsettable_memref_p (operands[0])" + && sparc_split_reg_mem_legitimate (operands[1], operands[0])" [(clobber (const_int 0))] { - rtx word0, word1; - - word0 = adjust_address (operands[0], SFmode, 0); - word1 = adjust_address (operands[0], SFmode, 4); - - emit_move_insn_1 (word0, gen_highpart (SFmode, operands[1])); - emit_move_insn_1 (word1, gen_lowpart (SFmode, operands[1])); + sparc_split_mem_reg (operands[0], operands[1], SFmode); DONE; }) @@ -2579,35 +2500,8 @@ && offsettable_memref_p (operands[0])" [(clobber (const_int 0))] { - rtx dest1, dest2; - - dest1 = adjust_address (operands[0], SFmode, 0); - dest2 = adjust_address (operands[0], SFmode, 4); - - emit_move_insn_1 (dest1, CONST0_RTX (SFmode)); - emit_move_insn_1 (dest2, CONST0_RTX (SFmode)); - DONE; -}) - -(define_split - [(set (match_operand:DF 0 "register_operand" "") - (match_operand:DF 1 "const_zero_operand" ""))] - "reload_completed - && TARGET_ARCH32 - && ((GET_CODE (operands[0]) == REG - && SPARC_INT_REG_P (REGNO (operands[0]))) - || (GET_CODE (operands[0]) == SUBREG - && GET_CODE (SUBREG_REG (operands[0])) == REG - && SPARC_INT_REG_P (REGNO (SUBREG_REG (operands[0])))))" - [(clobber (const_int 0))] -{ - rtx set_dest = operands[0]; - rtx dest1, dest2; - - dest1 = gen_highpart (SFmode, set_dest); - dest2 = gen_lowpart (SFmode, set_dest); - emit_move_insn_1 (dest1, CONST0_RTX (SFmode)); - emit_move_insn_1 (dest2, CONST0_RTX (SFmode)); + emit_move_insn_1 (adjust_address (operands[0], SFmode, 0), CONST0_RTX (SFmode)); + emit_move_insn_1 (adjust_address (operands[0], SFmode, 4), CONST0_RTX (SFmode)); DONE; }) @@ -8625,8 +8519,8 @@ (set_attr "v3pipe" "true,true,true,*,*,*,*,*,*,true,true")]) (define_insn "*mov_insn_sp64" - [(set (match_operand:VM64 0 "nonimmediate_operand" "=e,e,e,e,m,m,*r, m,*r, e,*r") - (match_operand:VM64 1 "input_operand" "Y,C,e,m,e,Y, m,*r, e,*r,*r"))] + [(set (match_operand:VM64 0 "nonimmediate_operand" "=e,e,e,e,W,m,*r, m,*r, e,*r") + (match_operand:VM64 1 "input_operand" "Y,Z,e,W,e,Y, m,*r, e,*r,*r"))] "TARGET_VIS && TARGET_ARCH64 && (register_operand (operands[0], mode) @@ -8648,13 +8542,17 @@ (set_attr "v3pipe" "true,true,true,*,*,*,*,*,*,*,*")]) (define_insn "*mov_insn_sp32" - [(set (match_operand:VM64 0 "nonimmediate_operand" "=e,e,e,*r, f,e,m,m,U,r,T,T, o,*r") - (match_operand:VM64 1 "input_operand" "Y,C,e, f,*r,m,e,Y,T,T,U,r,*r,*r"))] + [(set (match_operand:VM64 0 "nonimmediate_operand" + "=T,o,e,e,e,*r, f,e,W,U,T,e,o,*r,*r, o") + (match_operand:VM64 1 "input_operand" + " Y,Y,Y,Z,e, f,*r,W,e,T,U,o,e,*r, o,*r"))] "TARGET_VIS && TARGET_ARCH32 && (register_operand (operands[0], mode) || register_or_zero_or_all_ones_operand (operands[1], mode))" "@ + stx\t%r1, %0 + # fzero\t%0 fone\t%0 fsrc2\t%1, %0 @@ -8662,71 +8560,70 @@ # ldd\t%1, %0 std\t%1, %0 - stx\t%r1, %0 - ldd\t%1, %0 ldd\t%1, %0 std\t%1, %0 - std\t%1, %0 # - #" - [(set_attr "type" "visl,visl,vismv,*,*,fpload,fpstore,store,load,load,store,store,*,*") - (set_attr "length" "*,*,*,2,2,*,*,*,*,*,*,*,2,2") - (set_attr "cpu_feature" "vis,vis,vis,vis3,vis3,*,*,*,*,*,*,*,*,*") - (set_attr "v3pipe" "true,true,true,*,*,*,*,*,*,*,*,*,*,*") - (set_attr "lra" "*,*,*,*,*,*,*,*,disabled,enabled,disabled,enabled,*,*")]) + # + # + ldd\t%1, %0 + std\t%1, %0" + [(set_attr "type" "store,*,visl,visl,vismv,*,*,fpload,fpstore,load,store,*,*,*,load,store") + (set_attr "length" "*,2,*,*,*,2,2,*,*,*,*,2,2,2,*,*") + (set_attr "cpu_feature" "*,*,vis,vis,vis,vis3,vis3,*,*,*,*,*,*,*,*,*") + (set_attr "v3pipe" "*,*,true,true,true,*,*,*,*,*,*,*,*,*,*,*") + (set_attr "lra" "*,*,*,*,*,*,*,*,*,disabled,disabled,*,*,*,*,*")]) (define_split - [(set (match_operand:VM64 0 "memory_operand" "") + [(set (match_operand:VM64 0 "register_operand" "") (match_operand:VM64 1 "register_operand" ""))] "reload_completed && TARGET_VIS && TARGET_ARCH32 - && (((REGNO (operands[1]) % 2) != 0) - || !mem_min_alignment (operands[0], 8)) - && offsettable_memref_p (operands[0])" + && sparc_split_reg_reg_legitimate (operands[0], operands[1])" [(clobber (const_int 0))] { - rtx word0, word1; - - word0 = adjust_address (operands[0], SImode, 0); - word1 = adjust_address (operands[0], SImode, 4); - - emit_move_insn_1 (word0, gen_highpart (SImode, operands[1])); - emit_move_insn_1 (word1, gen_lowpart (SImode, operands[1])); + sparc_split_reg_reg (operands[0], operands[1], SImode); DONE; }) (define_split [(set (match_operand:VM64 0 "register_operand" "") - (match_operand:VM64 1 "register_operand" ""))] + (match_operand:VM64 1 "memory_operand" ""))] "reload_completed && TARGET_VIS && TARGET_ARCH32 - && sparc_split_regreg_legitimate (operands[0], operands[1])" + && sparc_split_reg_mem_legitimate (operands[0], operands[1])" [(clobber (const_int 0))] { - rtx set_dest = operands[0]; - rtx set_src = operands[1]; - rtx dest1, dest2; - rtx src1, src2; + sparc_split_reg_mem (operands[0], operands[1], SImode); + DONE; +}) - dest1 = gen_highpart (SImode, set_dest); - dest2 = gen_lowpart (SImode, set_dest); - src1 = gen_highpart (SImode, set_src); - src2 = gen_lowpart (SImode, set_src); +(define_split + [(set (match_operand:VM64 0 "memory_operand" "") + (match_operand:VM64 1 "register_operand" ""))] + "reload_completed + && TARGET_VIS + && TARGET_ARCH32 + && sparc_split_reg_mem_legitimate (operands[1], operands[0])" + [(clobber (const_int 0))] +{ + sparc_split_mem_reg (operands[0], operands[1], SImode); + DONE; +}) - /* Now emit using the real source and destination we found, swapping - the order if we detect overlap. */ - if (reg_overlap_mentioned_p (dest1, src2)) - { - emit_insn (gen_movsi (dest2, src2)); - emit_insn (gen_movsi (dest1, src1)); - } - else - { - emit_insn (gen_movsi (dest1, src1)); - emit_insn (gen_movsi (dest2, src2)); - } +(define_split + [(set (match_operand:VM64 0 "memory_operand" "") + (match_operand:VM64 1 "const_zero_operand" ""))] + "reload_completed + && TARGET_VIS + && TARGET_ARCH32 + && !mem_min_alignment (operands[0], 8) + && offsettable_memref_p (operands[0])" + [(clobber (const_int 0))] +{ + emit_move_insn_1 (adjust_address (operands[0], SImode, 0), const0_rtx); + emit_move_insn_1 (adjust_address (operands[0], SImode, 4), const0_rtx); DONE; }) -- cgit v1.1 From 648e17d28df124d8eac937babae804262d605499 Mon Sep 17 00:00:00 2001 From: James Greenhalgh Date: Mon, 5 Dec 2016 09:35:28 +0000 Subject: [Patch 2/2 PR78561] Recalculate constant pool size before emitting it gcc/testsuite/ PR rtl-optimization/78561 * gcc.target/aarch64/pr78561.c: Add missing testcase from r243183. From-SVN: r243239 --- gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/gcc.target/aarch64/pr78561.c | 9 +++++++++ 2 files changed, 14 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/pr78561.c diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 838fca5..2fe40d0 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-04 James Greenhalgh + + PR rtl-optimization/78561 + * gcc.target/aarch64/pr78561.c: Add missing testcase from r243183. + 2016-12-04 Martin Sebor PR c/78668 diff --git a/gcc/testsuite/gcc.target/aarch64/pr78561.c b/gcc/testsuite/gcc.target/aarch64/pr78561.c new file mode 100644 index 0000000..048d2d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr78561.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-Og -O3 -mcmodel=tiny" } */ + +int +main (__fp16 x) +{ + __fp16 a = 6.5504e4; + return (x <= a); +} -- cgit v1.1 From 5ffd5f36962aa5d2984f7453b553ef8076bd2e84 Mon Sep 17 00:00:00 2001 From: Andre Vieira Date: Mon, 5 Dec 2016 09:44:24 +0000 Subject: Fix arm-netbsdelf bootstrap. 2016-12-025 Andre Vieira * config/arm/arm.c (TARGET_ASM_INIT_SECTIONS): Fix wrong undef location. From-SVN: r243240 --- gcc/ChangeLog | 5 +++++ gcc/config/arm/arm.c | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 730044c..5d66a96 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-025 Andre Vieira + + * config/arm/arm.c (TARGET_ASM_INIT_SECTIONS): Fix wrong undef + location. + 2016-12-05 Eric Botcazou * config/sparc/sparc-protos.h (sparc_splitdi_legitimate): Rename to... diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index ec1f5fc..437da6f 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -595,8 +595,9 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_ASM_EMIT_EXCEPT_PERSONALITY #define TARGET_ASM_EMIT_EXCEPT_PERSONALITY arm_asm_emit_except_personality -#undef TARGET_ASM_INIT_SECTIONS #endif /* ARM_UNWIND_INFO */ + +#undef TARGET_ASM_INIT_SECTIONS #define TARGET_ASM_INIT_SECTIONS arm_asm_init_sections #undef TARGET_DWARF_REGISTER_SPAN -- cgit v1.1 From aad6838ec786662b6ffb28c494564aba29ad1a4e Mon Sep 17 00:00:00 2001 From: Eric Botcazou Date: Mon, 5 Dec 2016 11:15:17 +0000 Subject: system-darwin-ppc.ads (Support_Atomic_Primitives): Set to True only if the word size is 64. * system-darwin-ppc.ads (Support_Atomic_Primitives): Set to True only if the word size is 64. From-SVN: r243243 --- gcc/ada/ChangeLog | 5 +++++ gcc/ada/system-darwin-ppc.ads | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/gcc/ada/ChangeLog b/gcc/ada/ChangeLog index 3f221d2..1a9f161 100644 --- a/gcc/ada/ChangeLog +++ b/gcc/ada/ChangeLog @@ -1,3 +1,8 @@ +2016-12-05 Eric Botcazou + + * system-darwin-ppc.ads (Support_Atomic_Primitives): Set to True only + if the word size is 64. + 2016-11-30 Gary Dismukes * sem_prag.adb, sem_ch6.adb: Minor reformatting and typo fixes. diff --git a/gcc/ada/system-darwin-ppc.ads b/gcc/ada/system-darwin-ppc.ads index b58cfed..7809e14 100644 --- a/gcc/ada/system-darwin-ppc.ads +++ b/gcc/ada/system-darwin-ppc.ads @@ -161,7 +161,7 @@ private Stack_Check_Probes : constant Boolean := False; Stack_Check_Limits : constant Boolean := False; Support_Aggregates : constant Boolean := True; - Support_Atomic_Primitives : constant Boolean := True; + Support_Atomic_Primitives : constant Boolean := Word_Size = 64; Support_Composite_Assign : constant Boolean := True; Support_Composite_Compare : constant Boolean := True; Support_Long_Shifts : constant Boolean := True; -- cgit v1.1 From 5a5c5784d89008664ab42c17efcab7198b132456 Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Mon, 5 Dec 2016 12:16:38 +0100 Subject: [ARC] Remove unused patterns, refactor unspec+offset pattern gen. 2016-12-05 Claudiu Zissulescu * config/arc/arc-protos.h (insn_is_tls_gd_dispatch): Remove. * config/arc/arc.c (arc_unspec_offset): New function. (arc_finalize_pic): Change. (arc_emit_call_tls_get_addr): Likewise. (arc_legitimize_tls_address): Likewise. (arc_legitimize_pic_address): Likewise. (insn_is_tls_gd_dispatch): Remove. * config/arc/arc.h (INSN_REFERENCES_ARE_DELAYED): Change. * config/arc/arc.md (ls_gd_load): Remove unused pattern. (tls_gd_dispatch): Likewise. From-SVN: r243244 --- gcc/ChangeLog | 13 +++++++++++++ gcc/config/arc/arc-protos.h | 1 - gcc/config/arc/arc.c | 41 ++++++++++++++++++----------------------- gcc/config/arc/arc.h | 2 +- gcc/config/arc/arc.md | 34 ---------------------------------- 5 files changed, 32 insertions(+), 59 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5d66a96..c5095c2 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,16 @@ +2016-12-05 Claudiu Zissulescu + + * config/arc/arc-protos.h (insn_is_tls_gd_dispatch): Remove. + * config/arc/arc.c (arc_unspec_offset): New function. + (arc_finalize_pic): Change. + (arc_emit_call_tls_get_addr): Likewise. + (arc_legitimize_tls_address): Likewise. + (arc_legitimize_pic_address): Likewise. + (insn_is_tls_gd_dispatch): Remove. + * config/arc/arc.h (INSN_REFERENCES_ARE_DELAYED): Change. + * config/arc/arc.md (ls_gd_load): Remove unused pattern. + (tls_gd_dispatch): Likewise. + 2016-12-025 Andre Vieira * config/arm/arm.c (TARGET_ASM_INIT_SECTIONS): Fix wrong undef diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h index 83a0b73..bdbf7ce 100644 --- a/gcc/config/arc/arc-protos.h +++ b/gcc/config/arc/arc-protos.h @@ -121,6 +121,5 @@ extern int regno_clobbered_p (unsigned int, rtx_insn *, machine_mode, int); extern int arc_return_slot_offset (void); extern bool arc_legitimize_reload_address (rtx *, machine_mode, int, int); extern void arc_secondary_reload_conv (rtx, rtx, rtx, bool); -extern bool insn_is_tls_gd_dispatch (rtx_insn *); extern void arc_cpu_cpp_builtins (cpp_reader *); extern bool arc_store_addr_hazard_p (rtx_insn *, rtx_insn *); diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index 832f567..a0aa16e 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -2771,6 +2771,15 @@ arc_return_slot_offset () /* PIC */ +/* Helper to generate unspec constant. */ + +static rtx +arc_unspec_offset (rtx loc, int unspec) +{ + return gen_rtx_CONST (Pmode, gen_rtx_UNSPEC (Pmode, gen_rtvec (1, loc), + unspec)); +} + /* Emit special PIC prologues and epilogues. */ /* If the function has any GOTOFF relocations, then the GOTBASE register has to be setup in the prologue @@ -2796,9 +2805,7 @@ arc_finalize_pic (void) gcc_assert (flag_pic != 0); pat = gen_rtx_SYMBOL_REF (Pmode, "_DYNAMIC"); - pat = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, pat), ARC_UNSPEC_GOT); - pat = gen_rtx_CONST (Pmode, pat); - + pat = arc_unspec_offset (pat, ARC_UNSPEC_GOT); pat = gen_rtx_SET (baseptr_rtx, pat); emit_insn (pat); @@ -4866,8 +4873,7 @@ arc_emit_call_tls_get_addr (rtx sym, int reloc, rtx eqv) start_sequence (); - rtx x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, sym), reloc); - x = gen_rtx_CONST (Pmode, x); + rtx x = arc_unspec_offset (sym, reloc); emit_move_insn (r0, x); use_reg (&call_fusage, r0); @@ -4923,17 +4929,18 @@ arc_legitimize_tls_address (rtx addr, enum tls_model model) addr = gen_rtx_CONST (Pmode, addr); base = arc_legitimize_tls_address (base, TLS_MODEL_GLOBAL_DYNAMIC); return gen_rtx_PLUS (Pmode, force_reg (Pmode, base), addr); + case TLS_MODEL_GLOBAL_DYNAMIC: return arc_emit_call_tls_get_addr (addr, UNSPEC_TLS_GD, addr); + case TLS_MODEL_INITIAL_EXEC: - addr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_TLS_IE); - addr = gen_rtx_CONST (Pmode, addr); + addr = arc_unspec_offset (addr, UNSPEC_TLS_IE); addr = copy_to_mode_reg (Pmode, gen_const_mem (Pmode, addr)); return gen_rtx_PLUS (Pmode, arc_get_tp (), addr); + case TLS_MODEL_LOCAL_EXEC: local_exec: - addr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_TLS_OFF); - addr = gen_rtx_CONST (Pmode, addr); + addr = arc_unspec_offset (addr, UNSPEC_TLS_OFF); return gen_rtx_PLUS (Pmode, arc_get_tp (), addr); default: gcc_unreachable (); @@ -4964,14 +4971,11 @@ arc_legitimize_pic_address (rtx orig, rtx oldx) else if (!flag_pic) return orig; else if (CONSTANT_POOL_ADDRESS_P (addr) || SYMBOL_REF_LOCAL_P (addr)) - return gen_rtx_CONST (Pmode, - gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), - ARC_UNSPEC_GOTOFFPC)); + return arc_unspec_offset (addr, ARC_UNSPEC_GOTOFFPC); /* This symbol must be referenced via a load from the Global Offset Table (@GOTPC). */ - pat = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), ARC_UNSPEC_GOT); - pat = gen_rtx_CONST (Pmode, pat); + pat = arc_unspec_offset (addr, ARC_UNSPEC_GOT); pat = gen_const_mem (Pmode, pat); if (oldx == NULL) @@ -9985,15 +9989,6 @@ arc_dwarf_register_span (rtx rtl) return p; } -/* We can't inline this in INSN_REFERENCES_ARE_DELAYED because - resource.h doesn't include the required header files. */ - -bool -insn_is_tls_gd_dispatch (rtx_insn *insn) -{ - return recog_memoized (insn) == CODE_FOR_tls_gd_dispatch; -} - /* Return true if OP is an acceptable memory operand for ARCompact 16-bit load instructions of MODE. diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h index 611ef54..64bd9e0 100644 --- a/gcc/config/arc/arc.h +++ b/gcc/config/arc/arc.h @@ -1612,7 +1612,7 @@ extern enum arc_function_type arc_compute_function_type (struct function *); && (get_attr_type (X) == TYPE_CALL || get_attr_type (X) == TYPE_SFUNC)) #define INSN_REFERENCES_ARE_DELAYED(insn) \ - (INSN_SETS_ARE_DELAYED (insn) && !insn_is_tls_gd_dispatch (insn)) + (INSN_SETS_ARE_DELAYED (insn)) #define CALL_ATTR(X, NAME) \ ((CALL_P (X) || NONJUMP_INSN_P (X)) \ diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index 925fcd6..18bc68f 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -5486,21 +5486,6 @@ [(set_attr "is_sfunc" "yes") (set_attr "predicable" "yes")]) -(define_insn "tls_gd_load" - [(set (match_operand:SI 0 "dest_reg_operand" "=Rcq#q,c") - (unspec:SI [(match_operand:SI 1 "register_operand" "Rcq#q,c") - (match_operand:SI 2 "symbolic_operand" "X,X")] - UNSPEC_TLS_GD))] - "" - ".tls_gd_ld %2`ld%? %0,[%1]" - [(set_attr "type" "load") - ; if the linker has to patch this into IE, we need a long insn - ; (FIXME: or two short insn, ld_s / jl_s. missing -Os optimization.) - (set_attr_alternative "iscompact" - [(cond [(ne (symbol_ref "arc_tp_regno == 30") (const_int 0)) - (const_string "*")] (const_string "maybe")) - (const_string "*")])]) - (define_insn "tls_gd_get_addr" [(set (reg:SI R0_REG) (call:SI (mem:SI (unspec:SI [(match_operand:SI 0 @@ -5514,25 +5499,6 @@ ; With TARGET_MEDIUM_CALLS, plt calls are not predicable. (set_attr "predicable" "no")]) -; We make this call specific to the tls symbol to avoid commoning this -; with calls for other symbols; we want the linker to be able to -(define_insn "tls_gd_dispatch" - [(set (reg:SI R0_REG) - (unspec:SI - [(reg:SI R0_REG) - (call (mem:SI (match_operand:SI 0 "register_operand" "Rcq,q,c")) - (const_int 0)) - (match_operand:SI 1 "symbolic_operand" "X,X,X")] - UNSPEC_TLS_GD)) - (clobber (reg:SI RETURN_ADDR_REGNUM)) - (clobber (reg:DI R10_REG)) - (clobber (reg:SI R12_REG))] - "" - ".tls_gd_call %1`jl%!%* [%0]" - [(set_attr "type" "call") - (set_attr "iscompact" "maybe,false,*") - (set_attr "predicable" "no,no,yes")]) - ;; For thread pointer builtins (define_expand "get_thread_pointersi" [(set (match_operand:SI 0 "register_operand") (match_dup 1))] -- cgit v1.1 From 62440b4f0e4d6e0dc193d315e79753bb4c5edd99 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Mon, 5 Dec 2016 11:16:52 +0000 Subject: [ARC] Fix PIE. gcc/ 2016-12-05 Cupertino Miranda * config/arc/arc.h (STARTFILE_SPEC): Use default linux specs. (ENDFILE_SPEC): Likewise. libgcc/ 2016-12-05 Cupertino Miranda * config.host (arc*-*-linux-uclibc*): Use default extra objects. Include linux-android header. * config/arc/crti.S (_init): Declare symbol as function. (_fini): Likewise. From-SVN: r243245 --- gcc/ChangeLog | 5 +++++ gcc/config.gcc | 2 +- gcc/config/arc/arc.h | 10 ++++------ libgcc/ChangeLog | 7 +++++++ libgcc/config.host | 4 ++-- libgcc/config/arc/crti.S | 2 ++ 6 files changed, 21 insertions(+), 9 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c5095c2..ad903f9 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-05 Cupertino Miranda + + * config/arc/arc.h (STARTFILE_SPEC): Use default linux specs. + (ENDFILE_SPEC): Likewise. + 2016-12-05 Claudiu Zissulescu * config/arc/arc-protos.h (insn_is_tls_gd_dispatch): Remove. diff --git a/gcc/config.gcc b/gcc/config.gcc index 189073e..e034bc3 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -1023,7 +1023,7 @@ arc*-*-elf*) ;; arc*-*-linux-uclibc*) extra_headers="arc-simd.h" - tm_file="arc/arc-arch.h dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h ${tm_file}" + tm_file="arc/arc-arch.h dbxelf.h elfos.h gnu-user.h linux.h linux-android.h glibc-stdint.h ${tm_file}" tmake_file="${tmake_file} arc/t-uClibc arc/t-arc" tm_defines="${tm_defines} TARGET_SDATA_DEFAULT=0" tm_defines="${tm_defines} TARGET_MMEDIUM_CALLS_DEFAULT=1" diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h index 64bd9e0..f9512c4 100644 --- a/gcc/config/arc/arc.h +++ b/gcc/config/arc/arc.h @@ -138,17 +138,15 @@ extern const char *arc_cpu_to_as (int argc, const char **argv); #define STARTFILE_SPEC "%{!shared:crt0.o%s} crti%O%s %{pg|p:crtg.o%s} " \ "%(arc_tls_extra_start_spec) crtbegin.o%s" #else -#define STARTFILE_SPEC "%{!shared:%{!mkernel:crt1.o%s}} crti.o%s \ - %{!shared:%{pg|p|profile:crtg.o%s} crtbegin.o%s} %{shared:crtbeginS.o%s}" - +#define STARTFILE_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, ANDROID_STARTFILE_SPEC) #endif #if DEFAULT_LIBC != LIBC_UCLIBC #define ENDFILE_SPEC "%{pg|p:crtgend.o%s} crtend.o%s crtn%O%s" #else -#define ENDFILE_SPEC "%{!shared:%{pg|p|profile:crtgend.o%s} crtend.o%s} \ - %{shared:crtendS.o%s} crtn.o%s" - +#define ENDFILE_SPEC \ + LINUX_OR_ANDROID_LD (GNU_USER_TARGET_ENDFILE_SPEC, ANDROID_ENDFILE_SPEC) #endif #if DEFAULT_LIBC == LIBC_UCLIBC diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog index efadedf..d4a1bad 100644 --- a/libgcc/ChangeLog +++ b/libgcc/ChangeLog @@ -1,3 +1,10 @@ +2016-12-05 Cupertino Miranda + + * config.host (arc*-*-linux-uclibc*): Use default extra + objects. Include linux-android header. + * config/arc/crti.S (_init): Declare symbol as function. + (_fini): Likewise. + 2016-12-03 Thomas Koenig PR fortran/78379 diff --git a/libgcc/config.host b/libgcc/config.host index e7e5413..b1a2be6 100644 --- a/libgcc/config.host +++ b/libgcc/config.host @@ -374,8 +374,8 @@ arc*-*-elf*) ;; arc*-*-linux-uclibc*) tmake_file="${tmake_file} t-slibgcc-libgcc t-slibgcc-nolc-override arc/t-arc700-uClibc arc/t-arc" - extra_parts="crti.o crtn.o crtend.o crtbegin.o crtendS.o crtbeginS.o libgmon.a crtg.o crtgend.o" - extra_parts="${extra_parts} crttls.o" + extra_parts="$extra_parts crti.o crtn.o libgmon.a crtg.o crtgend.o" + extra_parts="$extra_parts crttls.o" ;; arm-wrs-vxworks) tmake_file="$tmake_file arm/t-arm arm/t-elf t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp" diff --git a/libgcc/config/arc/crti.S b/libgcc/config/arc/crti.S index 7f64305..6867ca9 100644 --- a/libgcc/config/arc/crti.S +++ b/libgcc/config/arc/crti.S @@ -31,11 +31,13 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see .section .init .global _init .word 0 + .type _init,@function _init: push_s blink .section .fini .global _fini .word 0 + .type _fini,@function _fini: push_s blink -- cgit v1.1 From fb5e7daea59060603a2e526cab4e35b32a8e9438 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Mon, 5 Dec 2016 12:27:55 +0100 Subject: re PR ada/48835 (porting GNAT to m68k-linux) PR ada/48835 * gcc-interface/Makefile.in: Add support for m68k-linux. * system-linux-m68k.ads: New file. From-SVN: r243247 --- gcc/ada/ChangeLog | 6 ++ gcc/ada/gcc-interface/Makefile.in | 29 +++++++ gcc/ada/system-linux-m68k.ads | 155 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 190 insertions(+) create mode 100644 gcc/ada/system-linux-m68k.ads diff --git a/gcc/ada/ChangeLog b/gcc/ada/ChangeLog index 1a9f161..858f8cb 100644 --- a/gcc/ada/ChangeLog +++ b/gcc/ada/ChangeLog @@ -1,3 +1,9 @@ +2016-12-05 Mikael Pettersson + + PR ada/48835 + * gcc-interface/Makefile.in: Add support for m68k-linux. + * system-linux-m68k.ads: New file. + 2016-12-05 Eric Botcazou * system-darwin-ppc.ads (Support_Atomic_Primitives): Set to True only diff --git a/gcc/ada/gcc-interface/Makefile.in b/gcc/ada/gcc-interface/Makefile.in index ec8aa07..98889c0 100644 --- a/gcc/ada/gcc-interface/Makefile.in +++ b/gcc/ada/gcc-interface/Makefile.in @@ -2049,6 +2049,35 @@ ifeq ($(strip $(filter-out hppa% linux%,$(target_cpu) $(target_os))),) LIBRARY_VERSION := $(LIB_VERSION) endif +# M68K Linux +ifeq ($(strip $(filter-out m68k% linux%,$(target_cpu) $(target_os))),) + LIBGNAT_TARGET_PAIRS = \ + a-intnam.ads. -- +-- -- +-- GNAT was originally developed by the GNAT team at New York University. -- +-- Extensive contributions were provided by Ada Core Technologies Inc. -- +-- -- +------------------------------------------------------------------------------ + +package System is + pragma Pure; + -- Note that we take advantage of the implementation permission to make + -- this unit Pure instead of Preelaborable; see RM 13.7.1(15). In Ada + -- 2005, this is Pure in any case (AI-362). + + type Name is (SYSTEM_NAME_GNAT); + System_Name : constant Name := SYSTEM_NAME_GNAT; + + -- System-Dependent Named Numbers + + Min_Int : constant := Long_Long_Integer'First; + Max_Int : constant := Long_Long_Integer'Last; + + Max_Binary_Modulus : constant := 2 ** Long_Long_Integer'Size; + Max_Nonbinary_Modulus : constant := 2 ** Integer'Size - 1; + + Max_Base_Digits : constant := Long_Long_Float'Digits; + Max_Digits : constant := Long_Long_Float'Digits; + + Max_Mantissa : constant := 63; + Fine_Delta : constant := 2.0 ** (-Max_Mantissa); + + Tick : constant := 0.000_001; + + -- Storage-related Declarations + + type Address is private; + pragma Preelaborable_Initialization (Address); + Null_Address : constant Address; + + Storage_Unit : constant := 8; + Word_Size : constant := 32; + Memory_Size : constant := 2 ** 32; + + -- Address comparison + + function "<" (Left, Right : Address) return Boolean; + function "<=" (Left, Right : Address) return Boolean; + function ">" (Left, Right : Address) return Boolean; + function ">=" (Left, Right : Address) return Boolean; + function "=" (Left, Right : Address) return Boolean; + + pragma Import (Intrinsic, "<"); + pragma Import (Intrinsic, "<="); + pragma Import (Intrinsic, ">"); + pragma Import (Intrinsic, ">="); + pragma Import (Intrinsic, "="); + + -- Other System-Dependent Declarations + + type Bit_Order is (High_Order_First, Low_Order_First); + Default_Bit_Order : constant Bit_Order := High_Order_First; + pragma Warnings (Off, Default_Bit_Order); -- kill constant condition warning + + -- Priority-related Declarations (RM D.1) + + -- Is the following actually true for GNU/Linux/m68k? + -- + -- 0 .. 98 corresponds to the system priority range 1 .. 99. + -- + -- If the scheduling policy is SCHED_FIFO or SCHED_RR the runtime makes use + -- of the entire range provided by the system. + -- + -- If the scheduling policy is SCHED_OTHER the only valid system priority + -- is 1 and other values are simply ignored. + + Max_Priority : constant Positive := 97; + Max_Interrupt_Priority : constant Positive := 98; + + subtype Any_Priority is Integer range 0 .. 98; + subtype Priority is Any_Priority range 0 .. 97; + subtype Interrupt_Priority is Any_Priority range 98 .. 98; + + Default_Priority : constant Priority := 48; + +private + + type Address is mod Memory_Size; + Null_Address : constant Address := 0; + + -------------------------------------- + -- System Implementation Parameters -- + -------------------------------------- + + -- These parameters provide information about the target that is used + -- by the compiler. They are in the private part of System, where they + -- can be accessed using the special circuitry in the Targparm unit + -- whose source should be consulted for more detailed descriptions + -- of the individual switch values. + + Backend_Divide_Checks : constant Boolean := False; + Backend_Overflow_Checks : constant Boolean := False; + Command_Line_Args : constant Boolean := True; + Configurable_Run_Time : constant Boolean := False; + Denorm : constant Boolean := True; + Duration_32_Bits : constant Boolean := False; + Exit_Status_Supported : constant Boolean := True; + Fractional_Fixed_Ops : constant Boolean := False; + Frontend_Layout : constant Boolean := False; + Machine_Overflows : constant Boolean := False; + Machine_Rounds : constant Boolean := True; + Preallocated_Stacks : constant Boolean := False; + Signed_Zeros : constant Boolean := True; + Stack_Check_Default : constant Boolean := False; + Stack_Check_Probes : constant Boolean := False; + Stack_Check_Limits : constant Boolean := False; + Support_Aggregates : constant Boolean := True; + Support_Atomic_Primitives : constant Boolean := True; + Support_Composite_Assign : constant Boolean := True; + Support_Composite_Compare : constant Boolean := True; + Support_Long_Shifts : constant Boolean := True; + Always_Compatible_Rep : constant Boolean := False; + Suppress_Standard_Library : constant Boolean := False; + Use_Ada_Main_Program_Name : constant Boolean := False; + Frontend_Exceptions : constant Boolean := False; + ZCX_By_Default : constant Boolean := True; + +end System; -- cgit v1.1 From d0ea9f0aa270f9791df42eb409e90c718575ad9a Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Mon, 5 Dec 2016 12:24:39 +0000 Subject: diagnostic.c (diagnostic_check_max_errors): New, broken out of ... gcc/ * diagnostic.c (diagnostic_check_max_errors): New, broken out of ... (diagnostic_action_after_output): ... here. (diagnostic_report_diagnostic): Call it for non-notes. * diagnostic.h (struct diagnostic_context): Make max_errors signed int. (diagnostic_check_max_errors): Declare. gcc/fortran/ * error.c (gfc_warning_check): Call diagnostic_check_max_errors. (gfc_error_check): Likewise. gcc/testsuite/ * c-c++-common/fmax_errors.c: Check notes after last error are emitted. From-SVN: r243254 --- gcc/ChangeLog | 9 +++++++ gcc/diagnostic.c | 40 ++++++++++++++++++++++---------- gcc/diagnostic.h | 3 ++- gcc/fortran/ChangeLog | 5 ++++ gcc/fortran/error.c | 2 ++ gcc/testsuite/ChangeLog | 5 ++++ gcc/testsuite/c-c++-common/fmax-errors.c | 14 +++++++++-- 7 files changed, 63 insertions(+), 15 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index ad903f9..9488b0f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2016-12-05 Nathan Sidwell + + * diagnostic.c (diagnostic_check_max_errors): New, broken out of ... + (diagnostic_action_after_output): ... here. + (diagnostic_report_diagnostic): Call it for non-notes. + * diagnostic.h (struct diagnostic_context): Make max_errors signed + int. + (diagnostic_check_max_errors): Declare. + 2016-12-05 Cupertino Miranda * config/arc/arc.h (STARTFILE_SPEC): Use default linux specs. diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c index 4278a10..c06d266 100644 --- a/gcc/diagnostic.c +++ b/gcc/diagnostic.c @@ -446,6 +446,31 @@ bt_err_callback (void *data ATTRIBUTE_UNUSED, const char *msg, int errnum) errnum == 0 ? "" : xstrerror (errnum)); } +/* Check if we've met the maximum error limit, and if so fatally exit + with a message. CONTEXT is the context to check, and FLUSH + indicates whether a diagnostic_finish call is needed. */ + +void +diagnostic_check_max_errors (diagnostic_context *context, bool flush) +{ + if (!context->max_errors) + return; + + int count = (diagnostic_kind_count (context, DK_ERROR) + + diagnostic_kind_count (context, DK_SORRY) + + diagnostic_kind_count (context, DK_WERROR)); + + if (count >= context->max_errors) + { + fnotice (stderr, + "compilation terminated due to -fmax-errors=%u.\n", + context->max_errors); + if (flush) + diagnostic_finish (context); + exit (FATAL_EXIT_CODE); + } +} + /* Take any action which is expected to happen after the diagnostic is written out. This function does not always return. */ void @@ -470,18 +495,6 @@ diagnostic_action_after_output (diagnostic_context *context, diagnostic_finish (context); exit (FATAL_EXIT_CODE); } - if (context->max_errors != 0 - && ((unsigned) (diagnostic_kind_count (context, DK_ERROR) - + diagnostic_kind_count (context, DK_SORRY) - + diagnostic_kind_count (context, DK_WERROR)) - >= context->max_errors)) - { - fnotice (stderr, - "compilation terminated due to -fmax-errors=%u.\n", - context->max_errors); - diagnostic_finish (context); - exit (FATAL_EXIT_CODE); - } break; case DK_ICE: @@ -890,6 +903,9 @@ diagnostic_report_diagnostic (diagnostic_context *context, return false; } + if (diagnostic->kind != DK_NOTE) + diagnostic_check_max_errors (context); + context->lock++; if (diagnostic->kind == DK_ICE || diagnostic->kind == DK_ICE_NOBT) diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h index ead4d2a..f3bb494 100644 --- a/gcc/diagnostic.h +++ b/gcc/diagnostic.h @@ -143,7 +143,7 @@ struct diagnostic_context bool dc_warn_system_headers; /* Maximum number of errors to report. */ - unsigned int max_errors; + int max_errors; /* This function is called before any message is printed out. It is responsible for preparing message prefix and such. For example, it @@ -320,6 +320,7 @@ void default_diagnostic_start_span_fn (diagnostic_context *, void default_diagnostic_finalizer (diagnostic_context *, diagnostic_info *); void diagnostic_set_caret_max_width (diagnostic_context *context, int value); void diagnostic_action_after_output (diagnostic_context *, diagnostic_t); +void diagnostic_check_max_errors (diagnostic_context *, bool flush = false); void diagnostic_file_cache_fini (void); diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index 2c06b31..f1858ea 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,8 @@ +2016-12-05 Nathan Sidwell + + * error.c (gfc_warning_check): Call diagnostic_check_max_errors. + (gfc_error_check): Likewise. + 2016-12-04 Janus Weil PR fortran/78618 diff --git a/gcc/fortran/error.c b/gcc/fortran/error.c index 0fd8a4e..757f7e2 100644 --- a/gcc/fortran/error.c +++ b/gcc/fortran/error.c @@ -1226,6 +1226,7 @@ gfc_warning_check (void) diagnostic_action_after_output (global_dc, warningcount_buffered ? DK_WARNING : DK_ERROR); + diagnostic_check_max_errors (global_dc, true); } } @@ -1370,6 +1371,7 @@ gfc_error_check (void) gcc_assert (gfc_output_buffer_empty_p (pp_error_buffer)); pp->buffer = tmp_buffer; diagnostic_action_after_output (global_dc, DK_ERROR); + diagnostic_check_max_errors (global_dc, true); return true; } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 2fe40d0..c40ffd6 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-05 Nathan Sidwell + + * c-c++-common/fmax_errors.c: Check notes after last error are + emitted. + 2016-12-04 James Greenhalgh PR rtl-optimization/78561 diff --git a/gcc/testsuite/c-c++-common/fmax-errors.c b/gcc/testsuite/c-c++-common/fmax-errors.c index 1ef78eb..b44e238 100644 --- a/gcc/testsuite/c-c++-common/fmax-errors.c +++ b/gcc/testsuite/c-c++-common/fmax-errors.c @@ -1,11 +1,21 @@ /* PR c/44782 */ /* { dg-do compile } */ -/* { dg-options "-fmax-errors=3" } */ +/* { dg-options "-fmax-errors=3 -Wall" } */ void foo (unsigned int i, unsigned int j) { (i) (); /* { dg-error "" } */ (j) (); /* { dg-error "" } */ - (i+j) (); /* { dg-error "" } */ + + i + j; /* { dg-warning "" } */ + + (k) (); /* { dg-error "" } */ + /* Make sure we see the notes related to the final error we emit. */ + /* { dg-message "identifier" "" { target c } 12 } */ + + /* Warnings after the final error should not appear. */ + i + j; /* no warning. */ + (i*j) (); /* no error here due to -fmax-errors */ + } /* { dg-prune-output "compilation terminated" } */ -- cgit v1.1 From 519e0faa00bda70c9ffd66b7e7a6011c5e742d2b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 5 Dec 2016 13:19:34 +0000 Subject: match.pd: Simplify X ? C : 0 where C is a power of 2 and X tests a single bit. gcc: * match.pd: Simplify X ? C : 0 where C is a power of 2 and X tests a single bit. gcc/testsuite: * gcc.dg/fold-and-lshift.c, gcc.dg/fold-and-rshift-1.c, gcc.dg/fold-and-rshift-2.c: New testcases. From-SVN: r243255 --- gcc/ChangeLog | 7 ++++++- gcc/match.pd | 28 +++++++++++++++++++++++++ gcc/testsuite/ChangeLog | 5 +++++ gcc/testsuite/gcc.dg/fold-and-lshift.c | 35 ++++++++++++++++++++++++++++++++ gcc/testsuite/gcc.dg/fold-and-rshift-1.c | 35 ++++++++++++++++++++++++++++++++ gcc/testsuite/gcc.dg/fold-and-rshift-2.c | 25 +++++++++++++++++++++++ 6 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/fold-and-lshift.c create mode 100644 gcc/testsuite/gcc.dg/fold-and-rshift-1.c create mode 100644 gcc/testsuite/gcc.dg/fold-and-rshift-2.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9488b0f..2ba0302 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2016-12-05 Paolo Bonzini + + * match.pd: Simplify X ? C : 0 where C is a power of 2 and + X tests a single bit. + 2016-12-05 Nathan Sidwell * diagnostic.c (diagnostic_check_max_errors): New, broken out of ... @@ -25,7 +30,7 @@ * config/arc/arc.md (ls_gd_load): Remove unused pattern. (tls_gd_dispatch): Likewise. -2016-12-025 Andre Vieira +2016-12-05 Andre Vieira * config/arm/arm.c (TARGET_ASM_INIT_SECTIONS): Fix wrong undef location. diff --git a/gcc/match.pd b/gcc/match.pd index dbb9103..1fe003b 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -2737,6 +2737,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (cmp (bit_and@2 @0 integer_pow2p@1) @1) (icmp @2 { build_zero_cst (TREE_TYPE (@0)); }))) +/* If we have (A & C) != 0 ? D : 0 where C and D are powers of 2, + convert this into a shift followed by ANDing with D. */ +(simplify + (cond + (ne (bit_and @0 integer_pow2p@1) integer_zerop) + integer_pow2p@2 integer_zerop) + (with { + int shift = wi::exact_log2 (@2) - wi::exact_log2 (@1); + } + (if (shift > 0) + (bit_and + (lshift (convert @0) { build_int_cst (integer_type_node, shift); }) @2) + (bit_and + (convert (rshift @0 { build_int_cst (integer_type_node, -shift); })) @2)))) + /* If we have (A & C) != 0 where C is the sign bit of A, convert this into A < 0. Similarly for (A & C) == 0 into A >= 0. */ (for cmp (eq ne) @@ -2751,6 +2766,19 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (with { tree stype = signed_type_for (TREE_TYPE (@0)); } (ncmp (convert:stype @0) { build_zero_cst (stype); }))))) +/* If we have A < 0 ? C : 0 where C is a power of 2, convert + this into a right shift followed by ANDing with C. */ +(simplify + (cond + (lt @0 integer_zerop) + integer_pow2p@1 integer_zerop) + (with { + int shift = element_precision (@0) - wi::exact_log2 (@1) - 1; + } + (bit_and + (convert (rshift @0 { build_int_cst (integer_type_node, shift); })) + @1))) + /* When the addresses are not directly of decls compare base and offset. This implements some remaining parts of fold_comparison address comparisons but still no complete part of it. Still it is good diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index c40ffd6..d9edb52 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-12-05 Paolo Bonzini + + * gcc.dg/fold-and-lshift.c, gcc.dg/fold-and-rshift-1.c, + gcc.dg/fold-and-rshift-2.c: New testcases. + 2016-12-05 Nathan Sidwell * c-c++-common/fmax_errors.c: Check notes after last error are diff --git a/gcc/testsuite/gcc.dg/fold-and-lshift.c b/gcc/testsuite/gcc.dg/fold-and-lshift.c new file mode 100644 index 0000000..2905095 --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-and-lshift.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fdump-tree-original" } */ + +int f(int x) +{ + return (x << 2) & 128; +} + +int g(int x) +{ + return !!(x & 32) << 7; +} + +int h(int x) +{ + return ((x >> 5) & 1) << 7; +} + +int i(int x) +{ + return (x & 32) >> 5 << 7; +} + +int j(int x) +{ + return ((x >> 5) & 1) ? 128 : 0; +} + +int k(int x) +{ + return (x & 32) ? 128 : 0; +} + +/* { dg-final { scan-tree-dump-not " \\? " "original" } } */ +/* { dg-final { scan-assembler-not "sarl" { target i?86-*-* x86_64-*-* } } }" */ diff --git a/gcc/testsuite/gcc.dg/fold-and-rshift-1.c b/gcc/testsuite/gcc.dg/fold-and-rshift-1.c new file mode 100644 index 0000000..11c13d5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-and-rshift-1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fdump-tree-original" } */ + +int f(int x) +{ + return (x >> 2) & 128; +} + +int g(int x) +{ + return !!(x & 512) << 7; +} + +int h(int x) +{ + return ((x >> 9) & 1) << 7; +} + +int i(int x) +{ + return (x & 512) >> 9 << 7; +} + +int j(int x) +{ + return ((x >> 9) & 1) ? 128 : 0; +} + +int k(int x) +{ + return (x & 512) ? 128 : 0; +} + +/* { dg-final { scan-tree-dump-not " \\? " "original" } } */ +/* { dg-final { scan-assembler-not "sall" { target i?86-*-* x86_64-*-* } } }" */ diff --git a/gcc/testsuite/gcc.dg/fold-and-rshift-2.c b/gcc/testsuite/gcc.dg/fold-and-rshift-2.c new file mode 100644 index 0000000..f88d48d --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-and-rshift-2.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fdump-tree-original" } */ + +unsigned f(unsigned x) +{ + return (x >> 29) & 32; +} + +unsigned g(unsigned x) +{ + return !!(x & 0x80000000) << 5; +} + +unsigned j(unsigned x) +{ + return ((x >> 31) & 1) ? 32 : 0; +} + +unsigned k(unsigned x) +{ + return (x & 0x80000000) ? 32 : 0; +} + +/* { dg-final { scan-tree-dump-not " \\? " "original" } } */ +/* { dg-final { scan-assembler-not "sall" { target i?86-*-* x86_64-*-* } } }" */ -- cgit v1.1 From a80b4f579a10d01a6cfdfff37150cfccd134dc41 Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Mon, 5 Dec 2016 14:54:42 +0100 Subject: Subject: [PATCH] Revert "Do not simplify "(and (reg) (const bit)" to if_then_else." * combine.c: Revert r243162. From-SVN: r243256 --- gcc/ChangeLog | 4 ++++ gcc/combine.c | 12 ------------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 2ba0302..1ace8b0 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,7 @@ +2016-12-05 Segher Boessenkool + + * combine.c: Revert r243162. + 2016-12-05 Paolo Bonzini * match.pd: Simplify X ? C : 0 where C is a power of 2 and diff --git a/gcc/combine.c b/gcc/combine.c index 7ba634a..b429453 100644 --- a/gcc/combine.c +++ b/gcc/combine.c @@ -5602,18 +5602,6 @@ combine_simplify_rtx (rtx x, machine_mode op0_mode, int in_dest, && OBJECT_P (SUBREG_REG (XEXP (x, 0))))))) { rtx cond, true_rtx, false_rtx; - unsigned HOST_WIDE_INT nz; - - /* If the operation is an AND wrapped in a SIGN_EXTEND or ZERO_EXTEND with - either operand being just a constant single bit value, do nothing since - IF_THEN_ELSE is likely to increase the expression's complexity. */ - if (HWI_COMPUTABLE_MODE_P (mode) - && pow2p_hwi (nz = nonzero_bits (x, mode)) - && ! ((code == SIGN_EXTEND || code == ZERO_EXTEND) - && GET_CODE (XEXP (x, 0)) == AND - && CONST_INT_P (XEXP (XEXP (x, 0), 0)) - && UINTVAL (XEXP (XEXP (x, 0), 0)) == nz)) - return x; cond = if_then_else_cond (x, &true_rtx, &false_rtx); if (cond != 0 -- cgit v1.1 From 6901ea625b473fd0f13194bcaaf2a1f9ac458d62 Mon Sep 17 00:00:00 2001 From: Andrew Senkevich Date: Mon, 5 Dec 2016 17:18:42 +0000 Subject: Add AVX512 k-mask intrinsics gcc/ 2016-12-05 Andrew Senkevich * config/i386/avx512bwintrin.h: Add new k-mask intrinsics. * config/i386/avx512dqintrin.h: Ditto. * config/i386/avx512fintrin.h: Ditto. * config/i386/i386-builtin-types.def (UCHAR_FTYPE_UQI_UQI_PUCHAR, UCHAR_FTYPE_UHI_UHI_PUCHAR, UCHAR_FTYPE_USI_USI_PUCHAR, UCHAR_FTYPE_UDI_UDI_PUCHAR, UCHAR_FTYPE_UQI_UQI, UCHAR_FTYPE_UHI_UHI, UCHAR_FTYPE_USI_USI, UCHAR_FTYPE_UDI_UDI, UQI_FTYPE_UQI_INT, UHI_FTYPE_UHI_INT, USI_FTYPE_USI_INT, UDI_FTYPE_UDI_INT, UQI_FTYPE_UQI, USI_FTYPE_USI, UDI_FTYPE_UDI, UQI_FTYPE_UQI_UQI): New function types. * config/i386/i386-builtin.def (__builtin_ia32_knotqi, __builtin_ia32_knotsi, __builtin_ia32_knotdi, __builtin_ia32_korqi, __builtin_ia32_korsi, __builtin_ia32_kordi, __builtin_ia32_kxnorqi, __builtin_ia32_kxnorsi, __builtin_ia32_kxnordi, __builtin_ia32_kxorqi, __builtin_ia32_kxorsi, __builtin_ia32_kxordi, __builtin_ia32_kandqi, __builtin_ia32_kandsi, __builtin_ia32_kanddi, __builtin_ia32_kandnqi, __builtin_ia32_kandnsi, __builtin_ia32_kandndi): New. * config/i386/i386.c (ix86_expand_args_builtin): Handle new types. gcc/testsuite/ 2016-12-05 Andrew Senkevich * gcc.target/i386/avx512bw-kandd-1.c: New. * gcc.target/i386/avx512bw-kandnd-1.c: Ditto. * gcc.target/i386/avx512bw-kandnq-1.c: Ditto. * gcc.target/i386/avx512bw-kandq-1.c: Ditto. * gcc.target/i386/avx512bw-knotd-1.c: Ditto. * gcc.target/i386/avx512bw-knotq-1.c: Ditto. * gcc.target/i386/avx512bw-kord-1.c: Ditto. * gcc.target/i386/avx512bw-korq-1.c: Ditto. * gcc.target/i386/avx512bw-kunpckdq-3.c: Ditto. * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto. * gcc.target/i386/avx512bw-kxnord-1.c: Ditto. * gcc.target/i386/avx512bw-kxnorq-1.c: Ditto. * gcc.target/i386/avx512bw-kxord-1.c: Ditto. * gcc.target/i386/avx512bw-kxorq-1.c: Ditto. * gcc.target/i386/avx512dq-kandb-1.c: Ditto. * gcc.target/i386/avx512dq-kandnb-1.c: Ditto. * gcc.target/i386/avx512dq-knotb-1.c: Ditto. * gcc.target/i386/avx512dq-korb-1.c: Ditto. * gcc.target/i386/avx512dq-kxnorb-1.c: Ditto. * gcc.target/i386/avx512dq-kxorb-1.c: Ditto. * gcc.target/i386/avx512f-kunpckbw-3.c: Ditto. * gcc.target/i386/avx512f-kandnw-1.c: Removed unneeded check. From-SVN: r243265 --- gcc/ChangeLog | 22 +++++ gcc/config/i386/avx512bwintrin.h | 100 +++++++++++++++++++++ gcc/config/i386/avx512dqintrin.h | 42 +++++++++ gcc/config/i386/avx512fintrin.h | 17 +++- gcc/config/i386/i386-builtin-types.def | 28 ++++++ gcc/config/i386/i386-builtin.def | 18 ++++ gcc/config/i386/i386.c | 10 +++ gcc/testsuite/ChangeLog | 25 ++++++ gcc/testsuite/gcc.target/i386/avx512bw-kandd-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512bw-kandnd-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512bw-kandnq-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512bw-kandq-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512bw-knotd-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512bw-knotq-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512bw-kord-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512bw-korq-1.c | 18 ++++ .../gcc.target/i386/avx512bw-kunpckdq-3.c | 16 ++++ .../gcc.target/i386/avx512bw-kunpckwd-3.c | 16 ++++ gcc/testsuite/gcc.target/i386/avx512bw-kxnord-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512bw-kxnorq-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512bw-kxord-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512bw-kxorq-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512dq-kandb-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512dq-kandnb-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512dq-knotb-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512dq-korb-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512dq-kxnorb-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512dq-kxorb-1.c | 18 ++++ gcc/testsuite/gcc.target/i386/avx512f-kandnw-1.c | 1 - gcc/testsuite/gcc.target/i386/avx512f-kunpckbw-3.c | 18 ++++ 30 files changed, 635 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kandd-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kandnd-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kandnq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kandq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-knotd-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-knotq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kord-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-korq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kunpckdq-3.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kxnord-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kxnorq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kxord-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-kxorq-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-kandb-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-kandnb-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-knotb-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-korb-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-kxnorb-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-kxorb-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-kunpckbw-3.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 1ace8b0..02d560d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,25 @@ +2016-12-05 Andrew Senkevich + + * config/i386/avx512bwintrin.h: Add new k-mask intrinsics. + * config/i386/avx512dqintrin.h: Ditto. + * config/i386/avx512fintrin.h: Ditto. + * config/i386/i386-builtin-types.def (UCHAR_FTYPE_UQI_UQI_PUCHAR, + UCHAR_FTYPE_UHI_UHI_PUCHAR, UCHAR_FTYPE_USI_USI_PUCHAR, + UCHAR_FTYPE_UDI_UDI_PUCHAR, UCHAR_FTYPE_UQI_UQI, UCHAR_FTYPE_UHI_UHI, + UCHAR_FTYPE_USI_USI, UCHAR_FTYPE_UDI_UDI, UQI_FTYPE_UQI_INT, + UHI_FTYPE_UHI_INT, USI_FTYPE_USI_INT, UDI_FTYPE_UDI_INT, + UQI_FTYPE_UQI, USI_FTYPE_USI, UDI_FTYPE_UDI, UQI_FTYPE_UQI_UQI): New + function types. + * config/i386/i386-builtin.def (__builtin_ia32_knotqi, + __builtin_ia32_knotsi, __builtin_ia32_knotdi, + __builtin_ia32_korqi, __builtin_ia32_korsi, __builtin_ia32_kordi, + __builtin_ia32_kxnorqi, __builtin_ia32_kxnorsi, + __builtin_ia32_kxnordi, __builtin_ia32_kxorqi, __builtin_ia32_kxorsi, + __builtin_ia32_kxordi, __builtin_ia32_kandqi, + __builtin_ia32_kandsi, __builtin_ia32_kanddi, __builtin_ia32_kandnqi, + __builtin_ia32_kandnsi, __builtin_ia32_kandndi): New. + * config/i386/i386.c (ix86_expand_args_builtin): Handle new types. + 2016-12-05 Segher Boessenkool * combine.c: Revert r243162. diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h index 4069802..9e6e0ce 100644 --- a/gcc/config/i386/avx512bwintrin.h +++ b/gcc/config/i386/avx512bwintrin.h @@ -40,6 +40,90 @@ typedef char __v64qi __attribute__ ((__vector_size__ (64))); typedef unsigned long long __mmask64; +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_knot_mask32 (__mmask32 __A) +{ + return (__mmask32) __builtin_ia32_knotsi ((__mmask32) __A); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_knot_mask64 (__mmask64 __A) +{ + return (__mmask64) __builtin_ia32_knotdi ((__mmask64) __A); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_korsi ((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kordi ((__mmask64) __A, (__mmask64) __B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxnor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kxnorsi ((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxnor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kxnordi ((__mmask64) __A, (__mmask64) __B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kxorsi ((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kxordi ((__mmask64) __A, (__mmask64) __B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kandsi ((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kanddi ((__mmask64) __A, (__mmask64) __B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kandnsi ((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kandndi ((__mmask64) __A, (__mmask64) __B); +} + extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) @@ -114,6 +198,14 @@ _mm512_kunpackw (__mmask32 __A, __mmask32 __B) (__mmask32) __B); } +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackw_mask32 (__mmask16 __A, __mmask16 __B) +{ + return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, + (__mmask32) __B); +} + extern __inline __mmask64 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_kunpackd (__mmask64 __A, __mmask64 __B) @@ -122,6 +214,14 @@ _mm512_kunpackd (__mmask64 __A, __mmask64 __B) (__mmask64) __B); } +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackd_mask64 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, + (__mmask64) __B); +} + extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h index 4b954f9..d2405c3 100644 --- a/gcc/config/i386/avx512dqintrin.h +++ b/gcc/config/i386/avx512dqintrin.h @@ -34,6 +34,48 @@ #define __DISABLE_AVX512DQ__ #endif /* __AVX512DQ__ */ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_knot_mask8 (__mmask8 __A) +{ + return (__mmask8) __builtin_ia32_knotqi ((__mmask8) __A); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kor_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_korqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxnor_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kxnorqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxor_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kxorqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kandqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B); +} + extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_broadcast_f64x2 (__m128d __A) diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h index 2372c83..ab1704b 100644 --- a/gcc/config/i386/avx512fintrin.h +++ b/gcc/config/i386/avx512fintrin.h @@ -9977,6 +9977,13 @@ _mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P) } /* Mask arithmetic operations */ +#define _kand_mask16 _mm512_kand +#define _kandn_mask16 _mm512_kandn +#define _knot_mask16 _mm512_knot +#define _kor_mask16 _mm512_kor +#define _kxnor_mask16 _mm512_kxnor +#define _kxor_mask16 _mm512_kxor + extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_kand (__mmask16 __A, __mmask16 __B) @@ -9988,7 +9995,8 @@ extern __inline __mmask16 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_kandn (__mmask16 __A, __mmask16 __B) { - return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B); + return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, + (__mmask16) __B); } extern __inline __mmask16 @@ -10042,6 +10050,13 @@ _mm512_kunpackb (__mmask16 __A, __mmask16 __B) return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); } +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackb_mask16 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); +} + #ifdef __OPTIMIZE__ extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 4a38c12..6e938eb 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -139,6 +139,12 @@ DEF_POINTER_TYPE (PLONGLONG, LONGLONG) DEF_POINTER_TYPE (PULONGLONG, ULONGLONG) DEF_POINTER_TYPE (PUNSIGNED, UNSIGNED) +DEF_POINTER_TYPE (PUQI, UQI) +DEF_POINTER_TYPE (PUHI, UHI) +DEF_POINTER_TYPE (PUSI, USI) +DEF_POINTER_TYPE (PUDI, UDI) +DEF_POINTER_TYPE (PUCHAR, UCHAR) + DEF_POINTER_TYPE (PV2SI, V2SI) DEF_POINTER_TYPE (PV2DF, V2DF) DEF_POINTER_TYPE (PV2DI, V2DI) @@ -536,7 +542,28 @@ DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI, V16SI, V16SI, PCV4SI) # Instructions returning mask +DEF_FUNCTION_TYPE (UCHAR, UQI, UQI, PUCHAR) +DEF_FUNCTION_TYPE (UCHAR, UQI, UQI) +DEF_FUNCTION_TYPE (UCHAR, UHI, UHI, PUCHAR) +DEF_FUNCTION_TYPE (UCHAR, UHI, UHI) +DEF_FUNCTION_TYPE (UCHAR, USI, USI, PUCHAR) +DEF_FUNCTION_TYPE (UCHAR, USI, USI) +DEF_FUNCTION_TYPE (UCHAR, UDI, UDI, PUCHAR) +DEF_FUNCTION_TYPE (UCHAR, UDI, UDI) + +DEF_FUNCTION_TYPE (USI, UQI) +DEF_FUNCTION_TYPE (USI, UHI) +DEF_FUNCTION_TYPE (UQI, USI) +DEF_FUNCTION_TYPE (UHI, USI) + +DEF_FUNCTION_TYPE (UQI, UQI, INT) +DEF_FUNCTION_TYPE (UHI, UHI, INT) +DEF_FUNCTION_TYPE (USI, USI, INT) +DEF_FUNCTION_TYPE (UDI, UDI, INT) +DEF_FUNCTION_TYPE (UQI, UQI) DEF_FUNCTION_TYPE (UHI, UHI) +DEF_FUNCTION_TYPE (USI, USI) +DEF_FUNCTION_TYPE (UDI, UDI) DEF_FUNCTION_TYPE (UHI, V16QI) DEF_FUNCTION_TYPE (USI, V32QI) DEF_FUNCTION_TYPE (UDI, V64QI) @@ -549,6 +576,7 @@ DEF_FUNCTION_TYPE (UHI, V16SI) DEF_FUNCTION_TYPE (UQI, V2DI) DEF_FUNCTION_TYPE (UQI, V4DI) DEF_FUNCTION_TYPE (UQI, V8DI) +DEF_FUNCTION_TYPE (UQI, UQI, UQI) DEF_FUNCTION_TYPE (UHI, UHI, UHI) DEF_FUNCTION_TYPE (USI, USI, USI) DEF_FUNCTION_TYPE (UDI, UDI, UDI) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index a9c272a..83a5089 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -1436,15 +1436,33 @@ BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd_vec_pack_sfix512, "__bu BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd_vec_pack_sfix512, "__builtin_ia32_ceilpd_vec_pack_sfix512", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512, (enum rtx_code) ROUND_CEIL, (int) V16SI_FTYPE_V8DF_V8DF_ROUND) /* Mask arithmetic operations */ +BDESC (OPTION_MASK_ISA_AVX512DQ, CODE_FOR_kandqi, "__builtin_ia32_kandqi", IX86_BUILTIN_KAND8, UNKNOWN, (int) UQI_FTYPE_UQI_UQI) BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_kandhi, "__builtin_ia32_kandhi", IX86_BUILTIN_KAND16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_kandsi, "__builtin_ia32_kandsi", IX86_BUILTIN_KAND32, UNKNOWN, (int) USI_FTYPE_USI_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_kanddi, "__builtin_ia32_kanddi", IX86_BUILTIN_KAND64, UNKNOWN, (int) UDI_FTYPE_UDI_UDI) +BDESC (OPTION_MASK_ISA_AVX512DQ, CODE_FOR_kandnqi, "__builtin_ia32_kandnqi", IX86_BUILTIN_KANDN8, UNKNOWN, (int) UQI_FTYPE_UQI_UQI) BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_kandnhi, "__builtin_ia32_kandnhi", IX86_BUILTIN_KANDN16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_kandnsi, "__builtin_ia32_kandnsi", IX86_BUILTIN_KANDN32, UNKNOWN, (int) USI_FTYPE_USI_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_kandndi, "__builtin_ia32_kandndi", IX86_BUILTIN_KANDN64, UNKNOWN, (int) UDI_FTYPE_UDI_UDI) +BDESC (OPTION_MASK_ISA_AVX512DQ, CODE_FOR_knotqi, "__builtin_ia32_knotqi", IX86_BUILTIN_KNOT8, UNKNOWN, (int) UQI_FTYPE_UQI) BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_knothi, "__builtin_ia32_knothi", IX86_BUILTIN_KNOT16, UNKNOWN, (int) UHI_FTYPE_UHI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_knotsi, "__builtin_ia32_knotsi", IX86_BUILTIN_KNOT32, UNKNOWN, (int) USI_FTYPE_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_knotdi, "__builtin_ia32_knotdi", IX86_BUILTIN_KNOT64, UNKNOWN, (int) UDI_FTYPE_UDI) +BDESC (OPTION_MASK_ISA_AVX512DQ, CODE_FOR_kiorqi, "__builtin_ia32_korqi", IX86_BUILTIN_KOR8, UNKNOWN, (int) UQI_FTYPE_UQI_UQI) BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_kiorhi, "__builtin_ia32_korhi", IX86_BUILTIN_KOR16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_kiorsi, "__builtin_ia32_korsi", IX86_BUILTIN_KOR32, UNKNOWN, (int) USI_FTYPE_USI_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_kiordi, "__builtin_ia32_kordi", IX86_BUILTIN_KOR64, UNKNOWN, (int) UDI_FTYPE_UDI_UDI) BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_kortestchi, "__builtin_ia32_kortestchi", IX86_BUILTIN_KORTESTC16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI) BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_kortestzhi, "__builtin_ia32_kortestzhi", IX86_BUILTIN_KORTESTZ16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI) BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_kunpckhi, "__builtin_ia32_kunpckhi", IX86_BUILTIN_KUNPCKBW, UNKNOWN, (int) UHI_FTYPE_UHI_UHI) +BDESC (OPTION_MASK_ISA_AVX512DQ, CODE_FOR_kxnorqi, "__builtin_ia32_kxnorqi", IX86_BUILTIN_KXNOR8, UNKNOWN, (int) UQI_FTYPE_UQI_UQI) BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_kxnorhi, "__builtin_ia32_kxnorhi", IX86_BUILTIN_KXNOR16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_kxnorsi, "__builtin_ia32_kxnorsi", IX86_BUILTIN_KXNOR32, UNKNOWN, (int) USI_FTYPE_USI_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_kxnordi, "__builtin_ia32_kxnordi", IX86_BUILTIN_KXNOR64, UNKNOWN, (int) UDI_FTYPE_UDI_UDI) +BDESC (OPTION_MASK_ISA_AVX512DQ, CODE_FOR_kxorqi, "__builtin_ia32_kxorqi", IX86_BUILTIN_KXOR8, UNKNOWN, (int) UQI_FTYPE_UQI_UQI) BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_kxorhi, "__builtin_ia32_kxorhi", IX86_BUILTIN_KXOR16, UNKNOWN, (int) UHI_FTYPE_UHI_UHI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_kxorsi, "__builtin_ia32_kxorsi", IX86_BUILTIN_KXOR32, UNKNOWN, (int) USI_FTYPE_USI_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, CODE_FOR_kxordi, "__builtin_ia32_kxordi", IX86_BUILTIN_KXOR64, UNKNOWN, (int) UDI_FTYPE_UDI_UDI) BDESC (OPTION_MASK_ISA_AVX512F, CODE_FOR_kmovw, "__builtin_ia32_kmov16", IX86_BUILTIN_KMOV16, UNKNOWN, (int) UHI_FTYPE_UHI) /* SHA */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 41717da..003439f 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -34842,7 +34842,12 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V4DI_FTYPE_V8HI: case V4DI_FTYPE_V4SI: case V4DI_FTYPE_V2DI: + case UQI_FTYPE_UQI: case UHI_FTYPE_UHI: + case USI_FTYPE_USI: + case USI_FTYPE_UQI: + case USI_FTYPE_UHI: + case UDI_FTYPE_UDI: case UHI_FTYPE_V16QI: case USI_FTYPE_V32QI: case UDI_FTYPE_V64QI: @@ -34976,6 +34981,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case UINT_FTYPE_UINT_UCHAR: case UINT16_FTYPE_UINT16_INT: case UINT8_FTYPE_UINT8_INT: + case UQI_FTYPE_UQI_UQI: case UHI_FTYPE_UHI_UHI: case USI_FTYPE_USI_USI: case UDI_FTYPE_UDI_UDI: @@ -35023,6 +35029,10 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V4DI_FTYPE_V8DI_INT: case QI_FTYPE_V4SF_INT: case QI_FTYPE_V2DF_INT: + case UQI_FTYPE_UQI_INT: + case UHI_FTYPE_UHI_INT: + case USI_FTYPE_USI_INT: + case UDI_FTYPE_UDI_INT: nargs = 2; nargs_constant = 1; break; diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index d9edb52..3b0a8fa 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,28 @@ +2016-12-05 Andrew Senkevich + + * gcc.target/i386/avx512bw-kandd-1.c: New. + * gcc.target/i386/avx512bw-kandnd-1.c: Ditto. + * gcc.target/i386/avx512bw-kandnq-1.c: Ditto. + * gcc.target/i386/avx512bw-kandq-1.c: Ditto. + * gcc.target/i386/avx512bw-knotd-1.c: Ditto. + * gcc.target/i386/avx512bw-knotq-1.c: Ditto. + * gcc.target/i386/avx512bw-kord-1.c: Ditto. + * gcc.target/i386/avx512bw-korq-1.c: Ditto. + * gcc.target/i386/avx512bw-kunpckdq-3.c: Ditto. + * gcc.target/i386/avx512bw-kunpckwd-3.c: Ditto. + * gcc.target/i386/avx512bw-kxnord-1.c: Ditto. + * gcc.target/i386/avx512bw-kxnorq-1.c: Ditto. + * gcc.target/i386/avx512bw-kxord-1.c: Ditto. + * gcc.target/i386/avx512bw-kxorq-1.c: Ditto. + * gcc.target/i386/avx512dq-kandb-1.c: Ditto. + * gcc.target/i386/avx512dq-kandnb-1.c: Ditto. + * gcc.target/i386/avx512dq-knotb-1.c: Ditto. + * gcc.target/i386/avx512dq-korb-1.c: Ditto. + * gcc.target/i386/avx512dq-kxnorb-1.c: Ditto. + * gcc.target/i386/avx512dq-kxorb-1.c: Ditto. + * gcc.target/i386/avx512f-kunpckbw-3.c: Ditto. + * gcc.target/i386/avx512f-kandnw-1.c: Removed unneeded check. + 2016-12-05 Paolo Bonzini * gcc.dg/fold-and-lshift.c, gcc.dg/fold-and-rshift-1.c, diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kandd-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kandd-1.c new file mode 100644 index 0000000..2a934f5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kandd-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kandd\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask32 k1, k2, k3; + volatile __m512i x = _mm512_setzero_epi32(); + + __asm__( "kmovd %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovd %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kand_mask32 (k1, k2); + x = _mm512_mask_add_epi16 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kandnd-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kandnd-1.c new file mode 100644 index 0000000..69cbe04 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kandnd-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kandnd\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask32 k1, k2, k3; + volatile __m512i x = _mm512_setzero_si512 (); + + __asm__( "kmovd %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovd %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kandn_mask32 (k1, k2); + x = _mm512_mask_add_epi16 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kandnq-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kandnq-1.c new file mode 100644 index 0000000..e8b7a5f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kandnq-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kandnq\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask64 k1, k2, k3; + volatile __m512i x = _mm512_setzero_si512 (); + + __asm__( "kmovq %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovq %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kandn_mask64 (k1, k2); + x = _mm512_mask_add_epi8 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kandq-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kandq-1.c new file mode 100644 index 0000000..a1aaed6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kandq-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kandq\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask64 k1, k2, k3; + volatile __m512i x = _mm512_setzero_epi32(); + + __asm__( "kmovq %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovq %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kand_mask64 (k1, k2); + x = _mm512_mask_add_epi8 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-knotd-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-knotd-1.c new file mode 100644 index 0000000..8a7e033 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-knotd-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "knotd\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask32 k1, k2; + volatile __m512i x = _mm512_setzero_si512 (); + + __asm__( "kmovd %1, %0" : "=k" (k1) : "r" (45) ); + + k2 = _knot_mask32 (k1); + x = _mm512_mask_add_epi16 (x, k1, x, x); + x = _mm512_mask_add_epi16 (x, k2, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-knotq-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-knotq-1.c new file mode 100644 index 0000000..deb6579 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-knotq-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "knotq\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask64 k1, k2; + volatile __m512i x = _mm512_setzero_si512 (); + + __asm__( "kmovq %1, %0" : "=k" (k1) : "r" (45) ); + + k2 = _knot_mask64 (k1); + x = _mm512_mask_add_epi8 (x, k1, x, x); + x = _mm512_mask_add_epi8 (x, k2, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kord-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kord-1.c new file mode 100644 index 0000000..4c35a81 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kord-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kord\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask32 k1, k2, k3; + volatile __m512i x = _mm512_setzero_si512 (); + + __asm__( "kmovd %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovd %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kor_mask32 (k1, k2); + x = _mm512_mask_add_epi16 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-korq-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-korq-1.c new file mode 100644 index 0000000..89753f0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-korq-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "korq\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask64 k1, k2, k3; + volatile __m512i x = _mm512_setzero_si512 (); + + __asm__( "kmovq %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovq %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kor_mask64 (k1, k2); + x = _mm512_mask_add_epi8 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckdq-3.c b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckdq-3.c new file mode 100644 index 0000000..951260f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckdq-3.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kunpckdq\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () { + volatile __mmask64 k3; + __mmask32 k1, k2; + + __asm__( "kmovd %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovd %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kunpackd_mask64 (k1, k2); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c new file mode 100644 index 0000000..c68ad8c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kunpckwd-3.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kunpckwd\[ \\t\]+\[^\{\n\]*%k\[1-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () { + volatile __mmask32 k3; + __mmask16 k1, k2; + + __asm__( "kmovw %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovw %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kunpackw_mask32 (k1, k2); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kxnord-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kxnord-1.c new file mode 100644 index 0000000..d93d61e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kxnord-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kxnord\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask32 k1, k2, k3; + volatile __m512i x = _mm512_setzero_si512 (); + + __asm__( "kmovd %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovd %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kxnor_mask32 (k1, k2); + x = _mm512_mask_add_epi16 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kxnorq-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kxnorq-1.c new file mode 100644 index 0000000..ba72e1f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kxnorq-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kxnorq\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask64 k1, k2, k3; + volatile __m512i x = _mm512_setzero_si512 (); + + __asm__( "kmovq %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovq %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kxnor_mask64 (k1, k2); + x = _mm512_mask_add_epi8 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kxord-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kxord-1.c new file mode 100644 index 0000000..97ea291 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kxord-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kxord\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask32 k1, k2, k3; + volatile __m512i x = _mm512_setzero_si512 (); + + __asm__( "kmovd %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovd %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kxor_mask32 (k1, k2); + x = _mm512_mask_add_epi16 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-kxorq-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-kxorq-1.c new file mode 100644 index 0000000..abf4280 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-kxorq-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "kxorq\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512bw_test () +{ + __mmask64 k1, k2, k3; + volatile __m512i x = _mm512_setzero_si512 (); + + __asm__( "kmovq %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovq %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kxor_mask64 (k1, k2); + x = _mm512_mask_add_epi8 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kandb-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-kandb-1.c new file mode 100644 index 0000000..b5b5367 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-kandb-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512dq -O2" } */ +/* { dg-final { scan-assembler-times "kandb\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512dq_test () +{ + __mmask8 k1, k2, k3; + volatile __m512i x = _mm512_setzero_epi32(); + + __asm__( "kmovb %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovb %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kand_mask8 (k1, k2); + x = _mm512_mask_add_epi64 (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kandnb-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-kandnb-1.c new file mode 100644 index 0000000..a0e96fd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-kandnb-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512dq -O2" } */ +/* { dg-final { scan-assembler-times "kandnb\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512dq_test () +{ + __mmask8 k1, k2, k3; + volatile __m512d x = _mm512_setzero_pd(); + + __asm__( "kmovb %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovb %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kandn_mask8 (k1, k2); + x = _mm512_mask_add_pd (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-knotb-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-knotb-1.c new file mode 100644 index 0000000..03bbf83 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-knotb-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512dq -O2" } */ +/* { dg-final { scan-assembler-times "knotb\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512dq_test () +{ + __mmask8 k1, k2; + volatile __m512d x = _mm512_setzero_pd(); + + __asm__( "kmovb %1, %0" : "=k" (k1) : "r" (45) ); + + k2 = _knot_mask8 (k1); + x = _mm512_mask_add_pd (x, k1, x, x); + x = _mm512_mask_add_pd (x, k2, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-korb-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-korb-1.c new file mode 100644 index 0000000..7717aee --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-korb-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512dq -O2" } */ +/* { dg-final { scan-assembler-times "korb\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512dq_test () +{ + __mmask8 k1, k2, k3; + volatile __m512d x = _mm512_setzero_pd(); + + __asm__( "kmovb %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovb %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kor_mask8 (k1, k2); + x = _mm512_mask_add_pd (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kxnorb-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-kxnorb-1.c new file mode 100644 index 0000000..faa974f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-kxnorb-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512dq -O2" } */ +/* { dg-final { scan-assembler-times "kxnorb\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512dq_test () +{ + __mmask8 k1, k2, k3; + volatile __m512d x = _mm512_setzero_pd(); + + __asm__( "kmovb %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovb %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kxnor_mask8 (k1, k2); + x = _mm512_mask_add_pd (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-kxorb-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-kxorb-1.c new file mode 100644 index 0000000..a21830b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-kxorb-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512dq -O2" } */ +/* { dg-final { scan-assembler-times "kxorb\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512dq_test () +{ + __mmask8 k1, k2, k3; + volatile __m512d x = _mm512_setzero_pd(); + + __asm__( "kmovb %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovb %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kxor_mask8 (k1, k2); + x = _mm512_mask_add_pd (x, k3, x, x); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kandnw-1.c b/gcc/testsuite/gcc.target/i386/avx512f-kandnw-1.c index 727a589..17b7b29 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-kandnw-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-kandnw-1.c @@ -1,7 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-mavx512f -O2" } */ /* { dg-final { scan-assembler-times "kandnw\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "kmovw" 2 } } */ #include diff --git a/gcc/testsuite/gcc.target/i386/avx512f-kunpckbw-3.c b/gcc/testsuite/gcc.target/i386/avx512f-kunpckbw-3.c new file mode 100644 index 0000000..2061f0a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-kunpckbw-3.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -O2" } */ +/* { dg-final { scan-assembler-times "kunpckbw\[ \\t\]+\[^\{\n\]*%k\[0-7\](?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +void +avx512f_test () { + __mmask8 k1, k2; + __mmask16 k3; + volatile __m512 x = _mm512_setzero_ps(); + + __asm__( "kmovb %1, %0" : "=k" (k1) : "r" (1) ); + __asm__( "kmovb %1, %0" : "=k" (k2) : "r" (2) ); + + k3 = _kunpackb_mask16 (k1, k2); + x = _mm512_mask_add_ps (x, k3, x, x); +} -- cgit v1.1 From 09955a32593b5ac9aedf40d1a0a9d715e840b5a3 Mon Sep 17 00:00:00 2001 From: Waldemar Brodkorb Date: Mon, 5 Dec 2016 17:48:39 +0000 Subject: * config.gcc (*-*-uclinux*): Enable posix threads. From-SVN: r243268 --- gcc/ChangeLog | 4 ++++ gcc/config.gcc | 3 +++ 2 files changed, 7 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 02d560d..793f710 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,7 @@ +2016-12-05 Waldemar Brodkorb + + * config.gcc (*-*-uclinux*): Enable posix threads. + 2016-12-05 Andrew Senkevich * config/i386/avx512bwintrin.h: Add new k-mask intrinsics. diff --git a/gcc/config.gcc b/gcc/config.gcc index e034bc3..d11e579 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -831,6 +831,9 @@ case ${target} in *-*-uclinux*) extra_options="$extra_options gnu-user.opt" use_gcc_stdint=wrap + case ${enable_threads} in + "" | yes | posix) thread_file='posix' ;; + esac tm_defines="$tm_defines DEFAULT_LIBC=LIBC_UCLIBC SINGLE_LIBC" ;; *-*-rdos*) -- cgit v1.1 From e3cc0f6bb67b012c26d3e7a5e2e76a90456c4d16 Mon Sep 17 00:00:00 2001 From: Jeff Law Date: Mon, 5 Dec 2016 10:49:41 -0700 Subject: re PR target/71721 (uclinux posix threads) PR target/71721 * config.gcc (*-*-uclinux*): Enable posix threads. Adding BZ marker From-SVN: r243269 --- gcc/ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 793f710..57c2450 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,6 @@ 2016-12-05 Waldemar Brodkorb + PR target/71721 * config.gcc (*-*-uclinux*): Enable posix threads. 2016-12-05 Andrew Senkevich -- cgit v1.1 From 8907a722fdfbe6992b4c5ee7ee5fa939213154af Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Mon, 5 Dec 2016 21:48:27 +0000 Subject: re PR tree-optimization/78646 (incorrect result type for pointer addition in slsr) 2016-12-05 Bill Schmidt Stefan Freudenberger PR tree-optimization/78646 * gimple-ssa-strength-reduction.c (replace_ref): The pointer addition used for the memory base expression should have the type of the candidate. Co-Authored-By: Stefan Freudenberger From-SVN: r243272 --- gcc/ChangeLog | 8 ++++++++ gcc/gimple-ssa-strength-reduction.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 57c2450..beef921 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2016-12-05 Bill Schmidt + Stefan Freudenberger + + PR tree-optimization/78646 + * gimple-ssa-strength-reduction.c (replace_ref): The pointer + addition used for the memory base expression should have the type + of the candidate. + 2016-12-05 Waldemar Brodkorb PR target/71721 diff --git a/gcc/gimple-ssa-strength-reduction.c b/gcc/gimple-ssa-strength-reduction.c index bdfdb9a..21dcbb0 100644 --- a/gcc/gimple-ssa-strength-reduction.c +++ b/gcc/gimple-ssa-strength-reduction.c @@ -1921,7 +1921,7 @@ replace_ref (tree *expr, slsr_cand_t c) if (align < TYPE_ALIGN (acc_type)) acc_type = build_aligned_type (acc_type, align); - add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (c->base_expr), + add_expr = fold_build2 (POINTER_PLUS_EXPR, c->cand_type, c->base_expr, c->stride); mem_ref = fold_build2 (MEM_REF, acc_type, add_expr, wide_int_to_tree (c->cand_type, c->index)); -- cgit v1.1 From 51d20f21607bc03375f687424a8c11c66dd37416 Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Mon, 5 Dec 2016 22:49:31 +0000 Subject: * es.po, fr.po: Update. From-SVN: r243273 --- gcc/po/ChangeLog | 4 + gcc/po/es.po | 278 +++----------- gcc/po/fr.po | 1133 ++++++++++++++++++++++-------------------------------- 3 files changed, 524 insertions(+), 891 deletions(-) diff --git a/gcc/po/ChangeLog b/gcc/po/ChangeLog index 4b6ca4b..fa76569 100644 --- a/gcc/po/ChangeLog +++ b/gcc/po/ChangeLog @@ -1,3 +1,7 @@ +2016-12-05 Joseph Myers + + * es.po, fr.po: Update. + 2016-12-01 Joseph Myers * es.po: Update. diff --git a/gcc/po/es.po b/gcc/po/es.po index cf04cb3..4172026 100644 --- a/gcc/po/es.po +++ b/gcc/po/es.po @@ -35,7 +35,7 @@ msgstr "" "Project-Id-Version: gcc 6.2.0\n" "Report-Msgid-Bugs-To: http://gcc.gnu.org/bugs.html\n" "POT-Creation-Date: 2016-08-19 21:03+0000\n" -"PO-Revision-Date: 2016-12-01 23:28+0100\n" +"PO-Revision-Date: 2016-12-04 14:54+0100\n" "Last-Translator: Antonio Ceballos \n" "Language-Team: Spanish \n" "Language: es\n" @@ -8972,42 +8972,30 @@ msgid "Known code models (for use with the -mcmodel= option):" msgstr "Modelos de código conocidos (para uso con la opción -mcmodel=):" #: config/i386/i386.opt:296 -#, fuzzy -#| msgid "Use complex addressing modes" msgid "Use given address mode." -msgstr "Usar modos de direccionamiento complejos." +msgstr "Usar el modo de dirección dado." #: config/i386/i386.opt:300 -#, fuzzy -#| msgid "Known code models (for use with the -mcmodel= option):" msgid "Known address mode (for use with the -maddress-mode= option):" -msgstr "Modelos de código conocidos (para uso con la opción -mcmodel=):" +msgstr "Modo de dirección conocido (para uso con la opción -maddress-mode=):" #: config/i386/i386.opt:309 msgid "%<-mcpu=%> is deprecated; use %<-mtune=%> or %<-march=%> instead" msgstr "%<-mcpu=%> es obsoleto; utilice %<-mtune=%> o %<-march%> en su lugar" #: config/i386/i386.opt:313 -#, fuzzy -#| msgid "Generate sin, cos, sqrt for FPU" msgid "Generate sin, cos, sqrt for FPU." msgstr "Genera sin, cos, sqrt para FPU." #: config/i386/i386.opt:317 -#, fuzzy -#| msgid "Always use Dynamic Realigned Argument Pointer (DRAP) to realign stack" msgid "Always use Dynamic Realigned Argument Pointer (DRAP) to realign stack." msgstr "Usar siempre el Puntero de Argumento Realineado Dinámicamente (DRAP) para realinear la pila." #: config/i386/i386.opt:321 -#, fuzzy -#| msgid "Return values of functions in FPU registers" msgid "Return values of functions in FPU registers." msgstr "Devuelve los valores de funciones en registros FPU." #: config/i386/i386.opt:325 -#, fuzzy -#| msgid "Generate floating point mathematics using given instruction set" msgid "Generate floating point mathematics using given instruction set." msgstr "Genera matemáticas de coma flotante usando el conjunto de instrucciones dado." @@ -9016,14 +9004,10 @@ msgid "Valid arguments to -mfpmath=:" msgstr "Argumentos válidos para -mfpmath=:" #: config/i386/i386.opt:362 -#, fuzzy -#| msgid "Inline all known string operations" msgid "Inline all known string operations." msgstr "Incluye en línea todas las operaciones de cadenas conocidas." #: config/i386/i386.opt:366 -#, fuzzy -#| msgid "Inline memset/memcpy string operations, but perform inline version only for small blocks" msgid "Inline memset/memcpy string operations, but perform inline version only for small blocks." msgstr "Incluye en línea las operaciones de cadena memset/memcpy, pero realiza la versión inline sólo para los bloques pequeños." @@ -9032,100 +9016,70 @@ msgid "%<-mintel-syntax%> and %<-mno-intel-syntax%> are deprecated; use %<-masm= msgstr "%<-mintel-syntax%> y %<-mno-intel-syntax%> son obsoletos; utilice %<-masm=intel%> y %<-masm=att%> en su lugar" #: config/i386/i386.opt:374 -#, fuzzy -#| msgid "Use native (MS) bitfield layout" msgid "Use native (MS) bitfield layout." msgstr "Usa la disposición de campos de bits nativos (MS)." #: config/i386/i386.opt:394 -#, fuzzy -#| msgid "Set 80387 floating-point precision to 32-bit" msgid "Set 80387 floating-point precision to 32-bit." msgstr "Establece la precisión de coma flotante 80387 a 32-bit." #: config/i386/i386.opt:398 -#, fuzzy -#| msgid "Set 80387 floating-point precision to 64-bit" msgid "Set 80387 floating-point precision to 64-bit." msgstr "Establece la precisión de coma flotante 80387 a 64-bit." #: config/i386/i386.opt:402 -#, fuzzy -#| msgid "Set 80387 floating-point precision to 80-bit" msgid "Set 80387 floating-point precision to 80-bit." msgstr "Establece la precisión de coma flotante 80387 a 80-bit." #: config/i386/i386.opt:406 -#, fuzzy -#| msgid "Attempt to keep stack aligned to this power of 2" msgid "Attempt to keep stack aligned to this power of 2." msgstr "Trata de mantenter la pila alineada a esta potencia de 2." #: config/i386/i386.opt:410 -#, fuzzy -#| msgid "Assume incoming stack aligned to this power of 2" msgid "Assume incoming stack aligned to this power of 2." msgstr "Asume que la pila de entrada está alineada a esta potencia de 2." #: config/i386/i386.opt:414 -#, fuzzy -#| msgid "Use push instructions to save outgoing arguments" msgid "Use push instructions to save outgoing arguments." msgstr "Usa instrucciones push para guardar los argumentos de salida." #: config/i386/i386.opt:418 -#, fuzzy -#| msgid "Use red-zone in the x86-64 code" msgid "Use red-zone in the x86-64 code." msgstr "Usa la zona roja en el código x86-64." #: config/i386/i386.opt:422 -#, fuzzy -#| msgid "Number of registers used to pass integer arguments" msgid "Number of registers used to pass integer arguments." msgstr "Número de registros usados para pasar argumentos enteros." #: config/i386/i386.opt:426 -#, fuzzy -#| msgid "Alternate calling convention" msgid "Alternate calling convention." -msgstr "Convención de llamada alternativa." +msgstr "Convenio de llamada alternativa." #: config/i386/i386.opt:430 config/alpha/alpha.opt:23 -#, fuzzy -#| msgid "Do not use hardware fp" msgid "Do not use hardware fp." msgstr "No usa fp de hardware." #: config/i386/i386.opt:434 -#, fuzzy -#| msgid "Use SSE register passing conventions for SF and DF mode" msgid "Use SSE register passing conventions for SF and DF mode." -msgstr "Usa las convenciones de paso de registro SSE para los modos SF y DF." +msgstr "Usa los convenios de paso de registro SSE para los modos SF y DF." #: config/i386/i386.opt:438 -#, fuzzy -#| msgid "Realign stack in prologue" msgid "Realign stack in prologue." msgstr "Realinea la pila en el prólogo." #: config/i386/i386.opt:442 -#, fuzzy -#| msgid "Enable stack probing" msgid "Enable stack probing." msgstr "Habilita la prueba de la pila." #: config/i386/i386.opt:446 msgid "Specify memcpy expansion strategy when expected size is known." -msgstr "" +msgstr "Especifica la estrategia de expansión de memcpy cuando se conoce el tamaño esperado." #: config/i386/i386.opt:450 msgid "Specify memset expansion strategy when expected size is known." -msgstr "" +msgstr "Especifica la estrategia de expansión de memset cuando se conoce el tamaño esperado." #: config/i386/i386.opt:454 -#, fuzzy -#| msgid "Chose strategy to generate stringop using" msgid "Chose strategy to generate stringop using." msgstr "Escoge la estrategia para generar stringop using." @@ -9134,8 +9088,6 @@ msgid "Valid arguments to -mstringop-strategy=:" msgstr "Argumentos válidos para -mstringop-strategy=:" #: config/i386/i386.opt:486 -#, fuzzy -#| msgid "Use given thread-local storage dialect" msgid "Use given thread-local storage dialect." msgstr "Usa el dialecto de almacenamiento thread-local dado." @@ -9144,30 +9096,23 @@ msgid "Known TLS dialects (for use with the -mtls-dialect= option):" msgstr "Dialectos TLS conocidos (para usar con la opción -mtls-dialect=):" #: config/i386/i386.opt:500 -#, fuzzy, c-format -#| msgid "Use direct references against %gs when accessing tls data" +#, c-format msgid "Use direct references against %gs when accessing tls data." -msgstr "Usa referencias directas contra %gs cuando se accesen datos tls." +msgstr "Usa referencias directas contra %gs cuando se acceden datos tls." #: config/i386/i386.opt:508 msgid "Fine grain control of tune features." -msgstr "" +msgstr "Control fino de las características de ajuste." #: config/i386/i386.opt:512 -#, fuzzy -#| msgid "Allow all ugly features" msgid "Clear all tune features." -msgstr "Desactiva todas las características feas." +msgstr "Quita todas las características de ajuste." #: config/i386/i386.opt:519 -#, fuzzy -#| msgid "Generate code that conforms to the given ABI" msgid "Generate code that conforms to Intel MCU psABI." -msgstr "Genera código que cumpla con la ABI dada." +msgstr "Genera código que cumpla con Intel MCU psABI." #: config/i386/i386.opt:523 -#, fuzzy -#| msgid "Generate code that conforms to the given ABI" msgid "Generate code that conforms to the given ABI." msgstr "Genera código que cumpla con la ABI dada." @@ -9176,8 +9121,6 @@ msgid "Known ABIs (for use with the -mabi= option):" msgstr "ABIs conocidas (para usar con la opción -mabi=):" #: config/i386/i386.opt:537 config/rs6000/rs6000.opt:189 -#, fuzzy -#| msgid "Vector library ABI to use" msgid "Vector library ABI to use." msgstr "ABI de biblioteca de vectores a utilizar." @@ -9186,8 +9129,6 @@ msgid "Known vectorization library ABIs (for use with the -mveclibabi= option):" msgstr "ABIs de biblioteca de vectorización conocidas (para usar con la opción -mveclibabi=):" #: config/i386/i386.opt:551 -#, fuzzy -#| msgid "Return 8-byte vectors in memory" msgid "Return 8-byte vectors in memory." msgstr "Devuelve vectores de 8 bytes en memoria." @@ -9207,101 +9148,72 @@ msgstr "Genera la instrucción cld en el prólogo de función." msgid "Generate vzeroupper instruction before a transfer of control flow out of" msgstr "Genera la instrucción vzeroupper antes de una transferencia de flujo de control fuera de" +# TODO review #: config/i386/i386.opt:572 msgid "Disable Scalar to Vector optimization pass transforming 64-bit integer" -msgstr "" +msgstr "Desactiva el paso de optimización de escalar a vector al transformar enteros de 64 bits" #: config/i386/i386.opt:577 -#, fuzzy -#| msgid "Do dispatch scheduling if processor is bdver1 or bdver2 and Haifa scheduling" msgid "Do dispatch scheduling if processor is bdver1, bdver2, bdver3, bdver4" -msgstr "Despacha al planificador si el procesador es bdver1 o bdver2 y la planificación es Haifa" +msgstr "Despacha al planificador si el procesador es bdver1, bdver2, bdver3, bdver4" #: config/i386/i386.opt:582 msgid "Use 128-bit AVX instructions instead of 256-bit AVX instructions in the auto-vectorizer." msgstr "Usa instrucciones AVX de 128-bit en lugar de instrucciones AVX de 256-bit en el auto-vectorizador." #: config/i386/i386.opt:588 -#, fuzzy -#| msgid "Generate 32bit i386 code" msgid "Generate 32bit i386 code." msgstr "Genera código i386 de 32bit." #: config/i386/i386.opt:592 -#, fuzzy -#| msgid "Generate 64bit x86-64 code" msgid "Generate 64bit x86-64 code." msgstr "Genera código x86-64 de 64bit." #: config/i386/i386.opt:596 -#, fuzzy -#| msgid "Generate 32bit x86-64 code" msgid "Generate 32bit x86-64 code." msgstr "Genera código x86-64 de 32bit." #: config/i386/i386.opt:600 -#, fuzzy -#| msgid "Generate 32bit i386 code" msgid "Generate 16bit i386 code." -msgstr "Genera código i386 de 32bit." +msgstr "Genera código i386 de 16bit." #: config/i386/i386.opt:604 -#, fuzzy -#| msgid "Support MMX built-in functions" msgid "Support MMX built-in functions." msgstr "Admite funciones internas MMX." #: config/i386/i386.opt:608 -#, fuzzy -#| msgid "Support 3DNow! built-in functions" msgid "Support 3DNow! built-in functions." msgstr "Admite funciones internas 3DNow!." #: config/i386/i386.opt:612 -#, fuzzy -#| msgid "Support Athlon 3Dnow! built-in functions" msgid "Support Athlon 3Dnow! built-in functions." msgstr "Admite funciones internas Athlon 3DNow!." #: config/i386/i386.opt:616 -#, fuzzy -#| msgid "Support MMX and SSE built-in functions and code generation" msgid "Support MMX and SSE built-in functions and code generation." msgstr "Admite funciones internas y generación de código MMX y SSE." #: config/i386/i386.opt:620 -#, fuzzy -#| msgid "Support MMX, SSE and SSE2 built-in functions and code generation" msgid "Support MMX, SSE and SSE2 built-in functions and code generation." msgstr "Admite funciones internas y generación de código MMX, SSE y SSE2." #: config/i386/i386.opt:624 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2 and SSE3 built-in functions and code generation" msgid "Support MMX, SSE, SSE2 and SSE3 built-in functions and code generation." msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2 y SSE3." #: config/i386/i386.opt:628 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation." msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3 y SSSE3." #: config/i386/i386.opt:632 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3 and SSE4.1 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3 and SSE4.1 built-in functions and code generation." msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3 y SSE4.1." #: config/i386/i386.opt:636 config/i386/i386.opt:640 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1 and SSE4.2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1 and SSE4.2 built-in functions and code generation." msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1 y SSE4.2." #: config/i386/i386.opt:644 -#, fuzzy -#| msgid "Do not support SSE4.1 and SSE4.2 built-in functions and code generation" msgid "Do not support SSE4.1 and SSE4.2 built-in functions and code generation." msgstr "No admite funciones internas y generación de código SSE4.1 y SSE4.2." @@ -9310,100 +9222,68 @@ msgid "%<-msse5%> was removed" msgstr "se eliminó %<-msse5%>" #: config/i386/i386.opt:652 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AVX built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AVX built-in functions and code generation." msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 y AVX." #: config/i386/i386.opt:656 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation." msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y AVX2." #: config/i386/i386.opt:660 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y AVX2." +msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 y AVX512F." #: config/i386/i386.opt:664 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512PF built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y AVX2." +msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F y AVX512PF." #: config/i386/i386.opt:668 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512ER built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y AVX2." +msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F y AVX512ER." #: config/i386/i386.opt:672 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512CD built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y AVX2." +msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F y AVX512CD." #: config/i386/i386.opt:676 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512DQ built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y AVX2." +msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F y AVX512DQ." #: config/i386/i386.opt:680 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512BW built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y AVX2." +msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F y AVX512BW." #: config/i386/i386.opt:684 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512VL built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y AVX2." +msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F y AVX512VL." #: config/i386/i386.opt:688 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512IFMA built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y AVX2." +msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F y AVX512IFMA." #: config/i386/i386.opt:692 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512VBMI built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y AVX2." +msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F y AVX512VBMI." #: config/i386/i386.opt:696 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and FMA built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and FMA built-in functions and code generation." msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX y FMA." #: config/i386/i386.opt:700 -#, fuzzy -#| msgid "Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation" msgid "Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation." msgstr "Admite funciones internas y generación de código MMX, SSE, SSE2, SSE3 y SSE4A." #: config/i386/i386.opt:704 -#, fuzzy -#| msgid "Support FMA4 built-in functions and code generation " msgid "Support FMA4 built-in functions and code generation." -msgstr "Admite funciones internas FMA4 y generación de código ." +msgstr "Admite funciones internas FMA4 y generación de código." #: config/i386/i386.opt:708 -#, fuzzy -#| msgid "Support XOP built-in functions and code generation " msgid "Support XOP built-in functions and code generation." -msgstr "Admite funciones internas XOP y generación de código ." +msgstr "Admite funciones internas XOP y generación de código." #: config/i386/i386.opt:712 -#, fuzzy -#| msgid "Support LWP built-in functions and code generation " msgid "Support LWP built-in functions and code generation." -msgstr "Admite funciones internas LWP y generación de código ." +msgstr "Admite funciones internas LWP y generación de código." #: config/i386/i386.opt:716 msgid "Support code generation of Advanced Bit Manipulation (ABM) instructions." @@ -9414,36 +9294,28 @@ msgid "Support code generation of popcnt instruction." msgstr "Admite la generación de código de la instrucción popcnt." #: config/i386/i386.opt:724 -#, fuzzy -#| msgid "Support BMI built-in functions and code generation" msgid "Support BMI built-in functions and code generation." msgstr "Admite funciones internas y generación de código BMI." #: config/i386/i386.opt:728 -#, fuzzy -#| msgid "Support BMI2 built-in functions and code generation" msgid "Support BMI2 built-in functions and code generation." msgstr "Admite funciones internas y generación de código BMI2." #: config/i386/i386.opt:732 -#, fuzzy -#| msgid "Support LZCNT built-in function and code generation" msgid "Support LZCNT built-in function and code generation." msgstr "Admite funciones internas y generación de código LZCNT." #: config/i386/i386.opt:736 msgid "Support Hardware Lock Elision prefixes." -msgstr "" +msgstr "Admite prefijos de elisión de bloqueo de hardware." #: config/i386/i386.opt:740 -#, fuzzy -#| msgid "no support for induction" msgid "Support RDSEED instruction." -msgstr "no se admite la inducción." +msgstr "Admite la instrucción RDSEED." #: config/i386/i386.opt:744 msgid "Support PREFETCHW instruction." -msgstr "" +msgstr "Admite la instrucción PREFETCHW." #: config/i386/i386.opt:748 #, fuzzy @@ -9453,49 +9325,37 @@ msgstr "Admite la generación de código de la instrucción crc32." #: config/i386/i386.opt:752 msgid "Support CLFLUSHOPT instructions." -msgstr "" +msgstr "Admite las instrucciones CLFLUSHOPT." #: config/i386/i386.opt:756 -#, fuzzy -#| msgid "no support for induction" msgid "Support CLWB instruction." -msgstr "no se admite la inducción." +msgstr "Admite la instrucción CLWB.." #: config/i386/i386.opt:760 -#, fuzzy -#| msgid "Support MMX built-in functions" msgid "Support PCOMMIT instruction." -msgstr "Admite funciones internas MMX." +msgstr "Admite la instrucción PCOMMIT." #: config/i386/i386.opt:764 msgid "Support FXSAVE and FXRSTOR instructions." -msgstr "" +msgstr "Admite las instrucciones PXSAVE y FXRSTOR." #: config/i386/i386.opt:768 -#, fuzzy -#| msgid "Support code generation of movbe instruction." msgid "Support XSAVE and XRSTOR instructions." -msgstr "Admite la generación de código de la instrucción movbe." +msgstr "Admite las instrucciones XSAV y XRSTOR." #: config/i386/i386.opt:772 -#, fuzzy -#| msgid "Support MMX built-in functions" msgid "Support XSAVEOPT instruction." -msgstr "Admite funciones internas MMX." +msgstr "Admite las instrucciones XSAVEOPT." #: config/i386/i386.opt:776 -#, fuzzy -#| msgid "Support MMX built-in functions" msgid "Support XSAVEC instructions." -msgstr "Admite funciones internas MMX." +msgstr "Admite las instrucciones XSAVEC." #: config/i386/i386.opt:780 msgid "Support XSAVES and XRSTORS instructions." -msgstr "" +msgstr "Admite las instrucciones XSAVES y XRSTORS." #: config/i386/i386.opt:784 -#, fuzzy -#| msgid "Support TBM built-in functions and code generation" msgid "Support TBM built-in functions and code generation." msgstr "Admite funciones internas y generación de código TBM." @@ -9516,52 +9376,36 @@ msgid "Support code generation of crc32 instruction." msgstr "Admite la generación de código de la instrucción crc32." #: config/i386/i386.opt:804 -#, fuzzy -#| msgid "Support AES built-in functions and code generation" msgid "Support AES built-in functions and code generation." msgstr "Admite funciones internas y generación de código AES." #: config/i386/i386.opt:808 -#, fuzzy -#| msgid "Support MMX and SSE built-in functions and code generation" msgid "Support SHA1 and SHA256 built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX y SSE." +msgstr "Admite funciones internas y generación de código SHA1 y SHA256." #: config/i386/i386.opt:812 -#, fuzzy -#| msgid "Support PCLMUL built-in functions and code generation" msgid "Support PCLMUL built-in functions and code generation." msgstr "Admite funciones internas y generación de código PCLMUL." #: config/i386/i386.opt:816 -#, fuzzy -#| msgid "Encode SSE instructions with VEX prefix" msgid "Encode SSE instructions with VEX prefix." msgstr "Codifica las instrucciones SSE con el prefijo VEX." #: config/i386/i386.opt:820 -#, fuzzy -#| msgid "Support FSGSBASE built-in functions and code generation" msgid "Support FSGSBASE built-in functions and code generation." msgstr "Admite funciones internas y generación de código FSGSBASE." #: config/i386/i386.opt:824 -#, fuzzy -#| msgid "Support RDRND built-in functions and code generation" msgid "Support RDRND built-in functions and code generation." msgstr "Admite funciones internas y generación de código RDRND." #: config/i386/i386.opt:828 -#, fuzzy -#| msgid "Support F16C built-in functions and code generation" msgid "Support F16C built-in functions and code generation." msgstr "Admite funciones internas y generación de código F16C." #: config/i386/i386.opt:832 -#, fuzzy -#| msgid "Support F16C built-in functions and code generation" msgid "Support PREFETCHWT1 built-in functions and code generation." -msgstr "Admite funciones internas y generación de código F16C." +msgstr "Admite funciones internas y generación de código PREFETCHWT1." #: config/i386/i386.opt:836 msgid "Emit profiling counter call at function entry before prologue." @@ -9569,77 +9413,59 @@ msgstr "Emite llamada a contador de perfil en la entrada de función antes del p #: config/i386/i386.opt:840 msgid "Generate __mcount_loc section with all mcount or __fentry__ calls." -msgstr "" +msgstr "Genera sección __mcount_loc con todos los mcount o llamadas __fentry__." #: config/i386/i386.opt:844 msgid "Generate mcount/__fentry__ calls as nops. To activate they need to be" -msgstr "" +msgstr "Genera llamadas mcount/__fentry__ como nops. Para activarlo han de ser" #: config/i386/i386.opt:849 msgid "Skip setting up RAX register when passing variable arguments." -msgstr "" +msgstr "Salta configurar registro RAX cuando se pasan argumentos variables." #: config/i386/i386.opt:853 -#, fuzzy -#| msgid "Expand 32bit/64bit integer divide into 8bit unsigned integer divide with run-time check" msgid "Expand 32bit/64bit integer divide into 8bit unsigned integer divide with run-time check." msgstr "Expande la división entera de 32bit/64bit en división entera sin signo de 8bit con revisión en tiempo de ejecución." #: config/i386/i386.opt:857 -#, fuzzy -#| msgid "Split 32-byte AVX unaligned load" msgid "Split 32-byte AVX unaligned load." msgstr "Divide load sin alinear AVX de 32-byte." #: config/i386/i386.opt:861 -#, fuzzy -#| msgid "Split 32-byte AVX unaligned store" msgid "Split 32-byte AVX unaligned store." msgstr "Divide store sin alinear AVX de 32-byte." #: config/i386/i386.opt:865 -#, fuzzy -#| msgid "Support TBM built-in functions and code generation" msgid "Support RTM built-in functions and code generation." -msgstr "Admite funciones internas y generación de código TBM." +msgstr "Admite funciones internas y generación de código RTM." #: config/i386/i386.opt:869 -#, fuzzy -#| msgid "Support code generation of movbe instruction." msgid "Support MPX code generation." -msgstr "Admite la generación de código de la instrucción movbe." +msgstr "Admite la generación de código MPX." #: config/i386/i386.opt:873 -#, fuzzy -#| msgid "Support MMX and SSE built-in functions and code generation" msgid "Support MWAITX and MONITORX built-in functions and code generation." -msgstr "Admite funciones internas y generación de código MMX y SSE." +msgstr "Admite funciones internas y generación de código MWAITX y MONITORX." #: config/i386/i386.opt:877 -#, fuzzy -#| msgid "Support AES built-in functions and code generation" msgid "Support CLZERO built-in functions and code generation." -msgstr "Admite funciones internas y generación de código AES." +msgstr "Admite funciones internas y generación de código CLZERO." #: config/i386/i386.opt:881 -#, fuzzy -#| msgid "Support PCLMUL built-in functions and code generation" msgid "Support PKU built-in functions and code generation." -msgstr "Admite funciones internas y generación de código PCLMUL." +msgstr "Admite funciones internas y generación de código PKU." #: config/i386/i386.opt:885 msgid "Use given stack-protector guard." -msgstr "" +msgstr "USa la guarda de protección de la pila dada." #: config/i386/i386.opt:889 -#, fuzzy -#| msgid "Known ARM architectures (for use with the -march= option):" msgid "Known stack protector guard (for use with the -mstack-protector-guard= option):" -msgstr "Arquitecturas ARM conocidas (para usar con la opción -march=):" +msgstr "Guarda de protección de pila conocida (para usar con la opción -mstack-protector-guard=):" #: config/i386/i386.opt:899 msgid "Attempt to avoid generating instruction sequences containing ret bytes." -msgstr "" +msgstr "Trata de evitar que se generen secuencias de instrucciones que contengan ret bytes." #: config/i386/stringop.opt:8 msgid "the Free Software Foundation; either version 3, or (at your option)" diff --git a/gcc/po/fr.po b/gcc/po/fr.po index 6b7518c..96a8789 100644 --- a/gcc/po/fr.po +++ b/gcc/po/fr.po @@ -133,7 +133,7 @@ msgstr "" "Project-Id-Version: gcc 6.2.0\n" "Report-Msgid-Bugs-To: http://gcc.gnu.org/bugs.html\n" "POT-Creation-Date: 2016-08-19 21:03+0000\n" -"PO-Revision-Date: 2016-11-27 16:58+0100\n" +"PO-Revision-Date: 2016-12-03 14:29+0100\n" "Last-Translator: Frédéric Marchal \n" "Language-Team: French \n" "Language: fr\n" @@ -2284,336 +2284,327 @@ msgstr "Le nombre de registres de chaque classe laissés inutilisés par les dé #: params.def:814 #, no-c-format msgid "The max number of reload pseudos which are considered during spilling a non-reload pseudo." -msgstr "Le nombre maximum de pseudos qui peuvent être rechargés qui doivent être considérés pendant le versage de pseudos qui ne peuvent pas être rechargés." +msgstr "Le nombre maximum de pseudos rechargeables qui doivent être considérés pendant le versage de pseudos qui ne peuvent pas être rechargés." #: params.def:819 #, no-c-format msgid "Minimal fall-through edge probability in percentage used to add BB to inheritance EBB in LRA." -msgstr "" +msgstr "La probabilité minimale, en pourcents, qu'une arrête enchaîne immédiatement sur une autre pour décider d'ajouter un BB à un EBB d'héritage dans LRA." #: params.def:827 #, no-c-format msgid "The maximum ratio between array size and switch branches for a switch conversion to take place." -msgstr "" +msgstr "La rapport maximum entre la taille du tableau et les branchements du switch pour qu'une conversion du switch ait lieu." #: params.def:835 #, no-c-format msgid "size of tiles for loop blocking." -msgstr "" +msgstr "la taille des blocs pour le découpage de boucle en blocs (loop blocking)." #: params.def:842 #, no-c-format msgid "maximum number of parameters in a SCoP." -msgstr "Le nombre maximum de paramètres dans un scope." +msgstr "le nombre maximum de paramètres dans un SCoP." #: params.def:849 #, no-c-format msgid "maximum number of basic blocks per function to be analyzed by Graphite." -msgstr "" +msgstr "le nombre maximum de blocs de base par fonction à analyser par Graphite." #: params.def:856 #, no-c-format msgid "maximum number of arrays per scop." -msgstr "Le nombre maximum de tableaux par scope." +msgstr "le nombre maximum de tableaux par scop." #: params.def:863 #, no-c-format msgid "minimal number of loops per function to be analyzed by Graphite." -msgstr "" +msgstr "le nombre minimum de boucles par fonction à analyser par Graphite." #: params.def:868 #, no-c-format msgid "maximum number of isl operations, 0 means unlimited" -msgstr "Le nombre maximum d'opérations isl, 0 pour illimité" +msgstr "le nombre maximum d'opérations isl, 0 pour illimité" #: params.def:874 #, no-c-format msgid "Maximum number of datarefs in loop for building loop data dependencies." -msgstr "" +msgstr "Le nombre maximum de datarefs dans une boucle pour construire les dépendances de données de la boucle." #: params.def:881 #, no-c-format msgid "Max basic blocks number in loop for loop invariant motion." -msgstr "" +msgstr "Le nombre max de blocs de base dans une boucle pour un déplacement de boucle invariant." #: params.def:889 -#, fuzzy, no-c-format -#| msgid "cannot find file for class %s" +#, no-c-format msgid "use internal function id in profile lookup." -msgstr "Ne peut repérer le fichier pour la classe %s." +msgstr "utiliser un ID de fonction interne dans la recherche de profilage." #: params.def:897 #, no-c-format msgid "track topn target addresses in indirect-call profile." -msgstr "" +msgstr "pister les N adresses cibles les plus fréquentes dans le profilage d'appels indirects." #: params.def:903 -#, fuzzy, no-c-format -#| msgid "The maximum number of instructions in a single function eligible for inlining" +#, no-c-format msgid "Maximum number of instructions in basic block to be considered for SLP vectorization." -msgstr "Le nombre maximum d'instructions dans une fonction simple éligible au type enligne" +msgstr "Le nombre maximum d'instructions dans un bloc de base à considérer pour la vectorisation SLP." #: params.def:908 #, no-c-format msgid "Min. ratio of insns to prefetches to enable prefetching for a loop with an unknown trip count." -msgstr "" +msgstr "Le rapport min entre les insns et les pré-extractions pour activer la pré-extraction dans une boucle avec un nombre inconnu d'itérations." #: params.def:914 #, no-c-format msgid "Min. ratio of insns to mem ops to enable prefetching in a loop." -msgstr "" +msgstr "Le rapport min entre les insns et les opérandes mémoire pour activer la pré-extraction dans une boucle." #: params.def:921 #, no-c-format msgid "Max. size of var tracking hash tables." -msgstr "" +msgstr "La taille max des tables de hachage pour le pistage des variables." #: params.def:929 #, no-c-format msgid "Max. recursion depth for expanding var tracking expressions." -msgstr "" +msgstr "La profondeur de récursion max pour développer les expressions de pistage de variables." #: params.def:937 #, no-c-format msgid "Max. size of loc list for which reverse ops should be added." -msgstr "" +msgstr "La taille max de la liste des emplacements pour laquelle des opérations inverses devraient être ajoutées." #: params.def:944 #, no-c-format msgid "The minimum UID to be used for a nondebug insn." -msgstr "" +msgstr "Le UID minimum à utiliser pour une insn qui n'est pas pour le déboguage." #: params.def:949 #, no-c-format msgid "Maximum allowed growth of size of new parameters ipa-sra replaces a pointer to an aggregate with." -msgstr "" +msgstr "La croissance maximale de la taille autorisée quand IPA-SRA remplace un pointeur vers un agrégat par de nouveaux paramètres." #: params.def:955 #, no-c-format msgid "Size in bytes after which thread-local aggregates should be instrumented with the logging functions instead of save/restore pairs." -msgstr "" +msgstr "La taille en octets au delà de laquelle des agrégats locaux au thread doivent être manipulés par les fonctions de journalisation au lieu de paires de save/restore." #: params.def:962 #, no-c-format msgid "Maximum size, in storage units, of an aggregate which should be considered for scalarization when compiling for speed." -msgstr "" +msgstr "La taille maximale, en unité de stockage, d'un agrégat qui devrait être considéré pour une conversion scalaire lors d'une compilation pour la vitesse." #: params.def:968 #, no-c-format msgid "Maximum size, in storage units, of an aggregate which should be considered for scalarization when compiling for size." -msgstr "" +msgstr "La taille maximale, en unité de stockage, d'un agrégat qui devrait être considéré pour une conversion scalaire lors d'une compilation pour l'espace." #: params.def:974 #, no-c-format msgid "Maximum size of a list of values associated with each parameter for interprocedural constant propagation." -msgstr "" +msgstr "La taille maximale d'une liste de valeurs associées à chaque paramètre pour la propagation de constantes interprocédurale." #: params.def:980 #, no-c-format msgid "Threshold ipa-cp opportunity evaluation that is still considered beneficial to clone.." -msgstr "" +msgstr "Le seuil de l'évaluation de l'opportunité de IPA-CP qui est encore considérée comme bénéfique au clone." #: params.def:986 #, no-c-format msgid "Percentage penalty the recursive functions will receive when they are evaluated for cloning.." -msgstr "" +msgstr "Le pourcentage de pénalité que des fonctions récursives recevront quand elles seront évaluées pour un clonage." #: params.def:992 #, no-c-format msgid "Percentage penalty functions containg a single call to another function will receive when they are evaluated for cloning.." -msgstr "" +msgstr "Le pourcentage de pénalité que des fonctions contenant un unique appel à une autre fonction recevront quand elles seront évaluées pour un clonage." #: params.def:998 #, no-c-format msgid "Maximum number of aggregate content items for a parameter in jump functions and lattices." -msgstr "" +msgstr "Le nombre maximum d'éléments contenus dans un agrégat d'un paramètre dans des fonctions de saut ou des maillages." #: params.def:1004 #, no-c-format msgid "Compile-time bonus IPA-CP assigns to candidates which make loop bounds or strides known.." -msgstr "" +msgstr "Bonus à la compilation que IPA-CP assigne aux candidats qui rendent les limites ou les pas des boucles connus." #: params.def:1010 #, no-c-format msgid "Compile-time bonus IPA-CP assigns to candidates which make an array index known.." -msgstr "" +msgstr "Bonus à la compilation que IPA-CP assigne aux candidats qui rendent connus l'index dans un tableau." #: params.def:1016 #, no-c-format msgid "Maximum number of statements that will be visited by IPA formal parameter analysis based on alias analysis in any given function." -msgstr "" +msgstr "Le nombre maximum d'expressions qui seront visitées par l'analyse des paramètres formels de IPA basé sur l'analyse d'alias dans une fonction quelconque." #: params.def:1024 #, no-c-format msgid "Number of partitions the program should be split to." -msgstr "" +msgstr "Le nombre de partitions dans lesquelles il faudrait scinder le programme." #: params.def:1029 #, no-c-format msgid "Minimal size of a partition for LTO (in estimated instructions)." -msgstr "" +msgstr "La taille minimale d'une partition pour LTO (en instructions estimées)." #: params.def:1036 -#, fuzzy, no-c-format -#| msgid "The maximum number of instructions to consider to fill a delay slot" +#, no-c-format msgid "Maximum number of namespaces to search for alternatives when name lookup fails." -msgstr "Le nombre maximum d'instructions à considérer pour remplir une slot délai" +msgstr "Le nombre maximum d'espaces de noms à parcourir pour chercher des alternatives quand la recherche de nom a échoué." #: params.def:1043 #, no-c-format msgid "Maximum number of conditional store pairs that can be sunk." -msgstr "" +msgstr "Le nombre maximum de paires de stockage conditionnel qui peuvent être descendues." #: params.def:1051 #, no-c-format msgid "The smallest number of different values for which it is best to use a jump-table instead of a tree of conditional branches, if 0, use the default for the machine." -msgstr "" +msgstr "Le plus petit nombre de valeurs différentes pour lequel il est préférable d'utiliser une table de sauts plutôt qu'un arbre de branchements conditionnels. Si 0, utilise le choix par défaut pour la machine." #: params.def:1059 #, no-c-format msgid "Allow new data races on stores to be introduced." -msgstr "" +msgstr "Permet l'introduction de compétitions sur de nouvelles données durant un stockage." #: params.def:1065 #, no-c-format msgid "Set the maximum number of instructions executed in parallel in reassociated tree. If 0, use the target dependent heuristic.." -msgstr "" +msgstr "Fixe le nombre maximum d'instructions exécutées en parallèle dans un arbre ré-associé. Si 0, utilise l'heuristique dépendant de la cible." #: params.def:1071 #, no-c-format msgid "Maximum amount of similar bbs to compare a bb with." -msgstr "" +msgstr "La quantité maximale de bbs similaires à comparer avec un bb." #: params.def:1076 #, no-c-format msgid "Maximum amount of iterations of the pass over a function." -msgstr "" +msgstr "Le nombre maximum d'itérations du passage sur une fonction." #: params.def:1083 #, no-c-format msgid "Maximum number of strings for which strlen optimization pass will track string lengths." -msgstr "" +msgstr "Le nombre maximum de chaînes pour lesquelles l'optimisation de strlen conserve la longueur des chaînes." #: params.def:1090 #, no-c-format msgid "Which -fsched-pressure algorithm to apply." -msgstr "" +msgstr "Quel algorithme de -fsched-pressure appliquer." #: params.def:1096 -#, fuzzy, no-c-format -#| msgid "Perform strength reduction optimizations" +#, no-c-format msgid "Maximum length of candidate scans for straight-line strength reduction." -msgstr "Exécuter un réduction en force des optimisations" +msgstr "La longueur maximale de la recherche dans les candidats pour la simplification des instructions dupliquées lors de l'aplatissage d'une boucle (Straight-Line Strength Reduction)." #: params.def:1102 #, no-c-format msgid "Enable asan stack protection." -msgstr "Autoriser la protection de pile asan" +msgstr "Autoriser ASan à protéger la pile." #: params.def:1107 #, no-c-format msgid "Enable asan globals protection." -msgstr "Autoriser la protection des constantes asan." +msgstr "Autoriser ASan à protéger les globales." #: params.def:1112 -#, fuzzy, no-c-format -#| msgid "Enable parallel instructions" +#, no-c-format msgid "Enable asan store operations protection." -msgstr "Autoriser les instructions parallèles" +msgstr "Autoriser ASan à protéger les opérations d'écriture." #: params.def:1117 -#, fuzzy, no-c-format -#| msgid "Enable parallel instructions" +#, no-c-format msgid "Enable asan load operations protection." -msgstr "Autoriser les instructions parallèles" +msgstr "Autoriser ASan à protéger les opérations de chargement." #: params.def:1122 -#, fuzzy, no-c-format -#| msgid "Enable function profiling" +#, no-c-format msgid "Enable asan builtin functions protection." -msgstr "Autoriser le profilage de fonction" +msgstr "Autoriser ASan à protéger les fonctions intégrées." #: params.def:1127 #, no-c-format msgid "Enable asan detection of use-after-return bugs." -msgstr "" +msgstr "Autoriser la détection par ASan des bogues d'utilisation après retour." #: params.def:1132 #, no-c-format msgid "Use callbacks instead of inline code if number of accesses in function becomes greater or equal to this number." -msgstr "" +msgstr "Utiliser une fonction de rappel au lieu de code en ligne si le nombre d'accès dans la fonction devient plus grand ou égal à ce nombre." #: params.def:1138 #, no-c-format msgid "Maximum number of nested calls to search for control dependencies during uninitialized variable analysis." -msgstr "" +msgstr "Nombre maximum d'appels imbriqués pour rechercher une dépendance de contrôle pendant l'analyse d'une variable non initialisée." #: params.def:1144 #, no-c-format msgid "Maximum number of statements to be included into a single static constructor generated by Pointer Bounds Checker." -msgstr "" +msgstr "Le nombre maximum d'expressions à inclure dans un seul constructeur statique généré par Pointer Bounds Checker." #: params.def:1150 #, no-c-format msgid "Scale factor to apply to the number of statements in a threading path when comparing to the number of (scaled) blocks." -msgstr "" +msgstr "Facteur d'échelle à appliquer au nombre d'expressions dans un chemin de sauts enchaînés lors de la comparaison de blocs (mis à l'échelle)." #: params.def:1155 #, no-c-format msgid "Maximum number of arguments a PHI may have before the FSM threader will not try to thread through its block." -msgstr "" +msgstr "Le nombre maximum d'arguments qu'un PHI peut avoir avant que le FSM qui est responsable de l'enchaînement arrête d'essayer d'enchaîner les sauts dans ses blocs." #: params.def:1160 #, no-c-format msgid "Scale factor to apply to the number of blocks in a threading path when comparing to the number of (scaled) statements." -msgstr "" +msgstr "Facteur d'échelle à appliquer au nombre de blocs dans un chemin de sauts enchaînés lors de la comparaison avec le nombre d'expressions (mises à l'échelle)." #: params.def:1165 #, no-c-format msgid "Maximum number of instructions to copy when duplicating blocks on a finite state automaton jump thread path." -msgstr "" +msgstr "Le nombre maximum d'instructions à copier en dupliquant des blocs le long du chemin de sauts enchaînés d'un automate à états finis." #: params.def:1170 #, no-c-format msgid "Maximum number of basic blocks on a finite state automaton jump thread path." -msgstr "" +msgstr "Le nombre maximum de blocs de base sur le chemin de sauts enchaînés d'un automate à états finis." #: params.def:1175 -#, fuzzy, no-c-format -#| msgid "The maximum number of instructions to consider to fill a delay slot" +#, no-c-format msgid "Maximum number of new jump thread paths to create for a finite state automaton." -msgstr "Le nombre maximum d'instructions à considérer pour remplir une slot délai" +msgstr "Le nombre maximum de nouveaux chemins pour enchaîner des sauts pour un automate à états finis." #: params.def:1180 #, no-c-format msgid "Chunk size of omp schedule for loops parallelized by parloops." -msgstr "" +msgstr "Taille des fragments de l'ordonnanceur OMP pour des boucles parallélisées par parloops." #: params.def:1185 #, no-c-format msgid "Schedule type of omp schedule for loops parallelized by parloops (static, dynamic, guided, auto, runtime)." -msgstr "" +msgstr "Type d'ordonnancement de l'ordonnanceur OMP pour des boucles parallélisées par parloops (static, dynamic, guided, auto, runtime)." #: params.def:1192 #, no-c-format msgid "Maximum recursion depth allowed when querying a property of an SSA name." -msgstr "" +msgstr "La profondeur de récursion maximum permise en interrogeant une propriété d'un nom SSA." #: params.def:1198 -#, fuzzy, no-c-format -#| msgid "The maximum number of instructions in a single function eligible for inlining" +#, no-c-format msgid "Maximum number of insns in a basic block to consider for RTL if-conversion." -msgstr "Le nombre maximum d'instructions dans une fonction simple éligible au type enligne" +msgstr "Le nombre maximum d'insns dans un bloc de base pour considérer une conversion du if par RTL." #: params.def:1204 #, no-c-format msgid "Level of hsa debug stores verbosity" -msgstr "" +msgstr "Le niveau de verbosité des stockages de déboguage de hsa" #: params.def:1209 #, no-c-format msgid "Maximum number of may-defs visited when devirtualizing speculatively" -msgstr "" +msgstr "Le nombre maximum de may-defs visités lors d'une dévirtualisation spéculative" #: c-family/c-format.c:417 msgid "format" @@ -2729,7 +2720,7 @@ msgstr "fanion « q »" #: c-family/c-format.c:593 msgid "the 'q' diagnostic flag" -msgstr "le fanion « d » de diagnostique" +msgstr "le fanion « q » de diagnostique" #: c-family/c-format.c:606 config/i386/msformat-c.c:63 msgid "assignment suppression" @@ -2737,7 +2728,7 @@ msgstr "suppression d'affectation" #: c-family/c-format.c:606 config/i386/msformat-c.c:63 msgid "the assignment suppression scanf feature" -msgstr "options de scanf pour la suppression d'affectation" +msgstr "la fonctionnalité de suppression d'affectation de scanf" #: c-family/c-format.c:607 config/i386/msformat-c.c:64 msgid "'a' flag" @@ -2865,19 +2856,19 @@ msgstr "largeur de champ dans le format de strfmon" #: c-family/c-format.c:660 msgid "left precision" -msgstr "précision de gauche" +msgstr "précision à gauche" #: c-family/c-format.c:660 msgid "left precision in strfmon format" -msgstr "précision de gauche dans le format de strfmon" +msgstr "précision à gauche dans le format de strfmon" #: c-family/c-format.c:661 msgid "right precision" -msgstr "précision de droite" +msgstr "précision à droite" #: c-family/c-format.c:661 msgid "right precision in strfmon format" -msgstr "précision de droite dans le format de strfmon" +msgstr "précision à droite dans le format de strfmon" #: c-family/c-format.c:662 msgid "length modifier in strfmon format" @@ -2892,7 +2883,7 @@ msgstr "" #: config/arm/arm.c:21997 config/nios2/nios2.c:2642 #, c-format msgid "Unsupported operand for code '%c'" -msgstr "opérande non supportée pour le code « %c »" +msgstr "Opérande non supporté pour le code « %c »" #: config/aarch64/aarch64.c:4463 config/aarch64/aarch64.c:4479 #: config/aarch64/aarch64.c:4492 config/aarch64/aarch64.c:4504 @@ -2900,18 +2891,18 @@ msgstr "opérande non supportée pour le code « %c »" #: config/aarch64/aarch64.c:4591 config/aarch64/aarch64.c:4794 #, c-format msgid "invalid operand for '%%%c'" -msgstr "opérande invalide pour « %%%c »" +msgstr "opérande invalide pour « %%%c »" #: config/aarch64/aarch64.c:4558 config/aarch64/aarch64.c:4571 #: config/aarch64/aarch64.c:4581 #, c-format msgid "incompatible floating point / vector register operand for '%%%c'" -msgstr "" +msgstr "opérande en virgule flottante ou registre vecteur incompatible pour « %%%c »" #: config/aarch64/aarch64.c:4627 config/arm/arm.c:22504 #, c-format msgid "missing operand" -msgstr "opérande manquante" +msgstr "opérande manquant" #: config/aarch64/aarch64.c:4689 #, c-format @@ -2926,13 +2917,13 @@ msgstr "opérande invalide" #: config/aarch64/aarch64.c:4805 #, c-format msgid "invalid operand prefix '%%%c'" -msgstr "préfixe d'opérande invalide « %%%c »" +msgstr "préfixe d'opérande invalide « %%%c »" #: config/alpha/alpha.c:5102 config/i386/i386.c:17140 #: config/rs6000/rs6000.c:21150 config/sparc/sparc.c:8749 #, c-format msgid "'%%&' used without any local dynamic TLS references" -msgstr "" +msgstr "« %%& » utilisé sans référence à un TLS dynamique local" #: config/alpha/alpha.c:5160 config/bfin/bfin.c:1423 #, c-format @@ -3078,16 +3069,14 @@ msgid "invalid shift operand" msgstr "opérande shift invalide" #: config/arm/arm.c:21835 config/arm/arm.c:21853 -#, fuzzy, c-format -#| msgid "Generate char instructions" +#, c-format msgid "predicated Thumb instruction" -msgstr "Générer des instructions « char »" +msgstr "instruction Thumb établie" #: config/arm/arm.c:21841 -#, fuzzy, c-format -#| msgid "ret instruction not implemented" +#, c-format msgid "predicated instruction in conditional sequence" -msgstr "instruction ret n'est pas implantée" +msgstr "instruction établie dans la séquence conditionnelle" #: config/arm/arm.c:22074 config/arm/arm.c:22096 config/arm/arm.c:22106 #: config/arm/arm.c:22116 config/arm/arm.c:22126 config/arm/arm.c:22165 @@ -3113,66 +3102,53 @@ msgstr "instruction jamais exécutée" #: config/arm/arm.c:22199 #, c-format msgid "obsolete Maverick format code '%c'" -msgstr "" +msgstr "code de format Maverick « %c » obsolète" #: config/arm/arm.c:23618 -#, fuzzy -#| msgid "function returns an aggregate" msgid "function parameters cannot have __fp16 type" -msgstr "fonction retourne un agrégat" +msgstr "les paramètres de fonction ne peuvent pas avoir le type __fp16" #: config/arm/arm.c:23628 -#, fuzzy -#| msgid "function does not return string type" msgid "functions cannot return __fp16 type" -msgstr "fonction ne retourne pas un type « string »" +msgstr "les fonctions ne peuvent pas retourner le type __fp16" #: config/avr/avr.c:2124 -#, fuzzy, c-format -#| msgid "read-write constraint does not allow a register" +#, c-format msgid "address operand requires constraint for X, Y, or Z register" -msgstr "contrainte de lecture-écriture ne permet pas de registre" +msgstr "l'opérande d'adresse requiert une contrainte sur le registre X, Y ou Z" #: config/avr/avr.c:2282 -#, fuzzy -#| msgid "output operand %d must use `&' constraint" msgid "operands to %T/%t must be reg + const_int:" -msgstr "opérande de sortie %d doit utiliser la contrainte « & »" +msgstr "les opérandes de %T/%t doivent être reg + const_int:" #: config/avr/avr.c:2332 config/avr/avr.c:2399 -#, fuzzy -#| msgid "bad address, not (reg+disp):" msgid "bad address, not an I/O address:" -msgstr "adresse erronée, pas (reg+disp):" +msgstr "mauvaise adresse, pas une adresse E/S:" #: config/avr/avr.c:2341 -#, fuzzy -#| msgid "address offset not a constant" msgid "bad address, not a constant:" -msgstr "décalage d'adresse n'est pas une constante" +msgstr "mauvaise adresse, pas une constante:" #: config/avr/avr.c:2359 config/avr/avr.c:2366 msgid "bad address, not (reg+disp):" -msgstr "adresse erronée, pas (reg+disp):" +msgstr "mauvaise adresse, pas (reg+disp):" #: config/avr/avr.c:2373 -#, fuzzy -#| msgid "bad address, not (reg+disp):" msgid "bad address, not post_inc or pre_dec:" -msgstr "adresse erronée, pas (reg+disp):" +msgstr "mauvaise adresse, pas post_inc ou pre_dec:" #: config/avr/avr.c:2385 msgid "internal compiler error. Bad address:" -msgstr "erreur internal du compilateur. Adresse erronée:" +msgstr "erreur interne du compilateur. Mauvaise adresse:" #: config/avr/avr.c:2418 #, c-format msgid "Unsupported code '%c' for fixed-point:" -msgstr "" +msgstr "Code « %c » non supporté en virgule fixe:" #: config/avr/avr.c:2426 msgid "internal compiler error. Unknown mode:" -msgstr "erreur internal du compilateur. Mode inconnu:" +msgstr "erreur interne du compilateur. Mode inconnu:" #: config/avr/avr.c:3419 config/avr/avr.c:4349 config/avr/avr.c:4798 msgid "invalid insn:" @@ -3183,35 +3159,34 @@ msgstr "insn invalide :" #: config/avr/avr.c:4201 config/avr/avr.c:4485 config/avr/avr.c:4691 #: config/avr/avr.c:4855 config/avr/avr.c:4949 config/avr/avr.c:5145 msgid "incorrect insn:" -msgstr "insn incorrect :" +msgstr "insn incorrecte :" #: config/avr/avr.c:3717 config/avr/avr.c:3992 config/avr/avr.c:4272 #: config/avr/avr.c:4557 config/avr/avr.c:4737 config/avr/avr.c:5005 #: config/avr/avr.c:5203 msgid "unknown move insn:" -msgstr "insn de déplacement inconnu :" +msgstr "insn de déplacement inconnue :" #: config/avr/avr.c:5634 msgid "bad shift insn:" -msgstr "décalage insn erroné :" +msgstr "insn de décalage erronée :" #: config/avr/avr.c:5742 config/avr/avr.c:6223 config/avr/avr.c:6638 msgid "internal compiler error. Incorrect shift:" -msgstr "erreur internal du compilateur. Décalage incorrect:" +msgstr "erreur interne du compilateur. Décalage incorrect:" #: config/avr/avr.c:7975 -#, fuzzy -#| msgid "unsupported version" msgid "unsupported fixed-point conversion" -msgstr "version non reconnue" +msgstr "conversion en virgule fixe non supportée" #: config/avr/driver-avr.c:71 -#, fuzzy, c-format -#| msgid "unknown spec function `%s'" +#, c-format msgid "" "Running spec function '%s' with %d args\n" "\n" -msgstr "spécification de fonction inconnue « %s »:" +msgstr "" +"Exécution de la fonction spec « %s » avec %d args\n" +"\n" #: config/avr/driver-avr.c:118 #, c-format @@ -3220,6 +3195,9 @@ msgid "" "'%s': specfile='%s'\n" "\n" msgstr "" +"« %s »: mmcu=« %s »\n" +"« %s »: specfile=« %s »\n" +"\n" #: config/bfin/bfin.c:1385 #, c-format @@ -3254,16 +3232,12 @@ msgid "invalid operand for 'b' modifier" msgstr "opérande invalide pour le modificateur « b »" #: config/cris/cris.c:761 -#, fuzzy -#| msgid "invalid operand for 'b' modifier" msgid "invalid operand for 'o' modifier" -msgstr "opérande invalide pour le modificateur « b »" +msgstr "opérande invalide pour le modificateur « o »" #: config/cris/cris.c:780 -#, fuzzy -#| msgid "invalid operand for 'b' modifier" msgid "invalid operand for 'O' modifier" -msgstr "opérande invalide pour le modificateur « b »" +msgstr "opérande invalide pour le modificateur « O »" #: config/cris/cris.c:813 msgid "invalid operand for 'p' modifier" @@ -3279,7 +3253,7 @@ msgstr "opérande invalide pour le modificateur « H »" #: config/cris/cris.c:926 msgid "bad register" -msgstr "registre erroné" +msgstr "mauvais registre" #: config/cris/cris.c:970 msgid "invalid operand for 'e' modifier" @@ -3303,15 +3277,15 @@ msgstr "opérande invalide pour le modificateur « T »" #: config/cris/cris.c:1116 config/ft32/ft32.c:230 config/moxie/moxie.c:173 msgid "invalid operand modifier letter" -msgstr "opérande invalide pour le modificateur de lettre" +msgstr "lettre de modificateur d'opérande invalide" #: config/cris/cris.c:1170 msgid "unexpected multiplicative operand" -msgstr "opérande multiplicative inattendue" +msgstr "opérande multiplicatif inattendu" #: config/cris/cris.c:1190 config/ft32/ft32.c:253 config/moxie/moxie.c:198 msgid "unexpected operand" -msgstr "opérande inattendue" +msgstr "opérande inattendu" #: config/cris/cris.c:1229 config/cris/cris.c:1239 msgid "unrecognized address" @@ -3319,21 +3293,21 @@ msgstr "adresse non reconnue" #: config/cris/cris.c:2559 msgid "unrecognized supposed constant" -msgstr "supposée constante non reconnue" +msgstr "constante supposée non reconnue" #: config/cris/cris.c:2958 config/cris/cris.c:3016 msgid "unexpected side-effects in address" -msgstr "effets de bord inattendue dans l'adresse" +msgstr "effets de bord inattendus dans l'adresse" #. Can't possibly get anything else for a function-call, right? #: config/cris/cris.c:3844 msgid "unidentifiable call op" -msgstr "" +msgstr "opérande d'appel non identifiable" #: config/cris/cris.c:3906 #, c-format msgid "PIC register isn't set up" -msgstr "le registre n'est pas initialisé" +msgstr "le registre PIC n'est pas initialisé" #: config/fr30/fr30.c:496 #, c-format @@ -3343,12 +3317,12 @@ msgstr "fr30_print_operand_address: adresse non traitée" #: config/fr30/fr30.c:520 #, c-format msgid "fr30_print_operand: unrecognized %%p code" -msgstr "fr30_print_operand: code %%p non reconnue" +msgstr "fr30_print_operand: code %%p non reconnu" #: config/fr30/fr30.c:540 #, c-format msgid "fr30_print_operand: unrecognized %%b code" -msgstr "fr30_print_operand: code %%b non reconnue" +msgstr "fr30_print_operand: code %%b non reconnu" #: config/fr30/fr30.c:561 #, c-format @@ -3358,7 +3332,7 @@ msgstr "fr30_print_operand: code %%B non reconnu" #: config/fr30/fr30.c:569 #, c-format msgid "fr30_print_operand: invalid operand to %%A code" -msgstr "fr30_print_operand: opérande invalide pour code %%A" +msgstr "fr30_print_operand: opérande invalide pour le code %%A" #: config/fr30/fr30.c:586 #, c-format @@ -3382,23 +3356,17 @@ msgid "fr30_print_operand: unhandled MEM" msgstr "fr30_print_operand: MEM non traité" #: config/frv/frv.c:2507 -#, fuzzy -#| msgid "Bad insn to frv_print_operand_address:" msgid "bad insn to frv_print_operand_address:" -msgstr "insn erroné pour frv_print_operand_addresse:" +msgstr "mauvaise insn pour frv_print_operand_addresse:" #: config/frv/frv.c:2518 -#, fuzzy -#| msgid "Bad register to frv_print_operand_memory_reference_reg:" msgid "bad register to frv_print_operand_memory_reference_reg:" -msgstr "registre erroné pour frv_print_operand_memory_reference_reg:" +msgstr "mauvais registre pour frv_print_operand_memory_reference_reg:" #: config/frv/frv.c:2557 config/frv/frv.c:2567 config/frv/frv.c:2576 #: config/frv/frv.c:2597 config/frv/frv.c:2602 -#, fuzzy -#| msgid "Bad insn to frv_print_operand_memory_reference:" msgid "bad insn to frv_print_operand_memory_reference:" -msgstr "insn erroné pour frv_print_operand_memory_reference:" +msgstr "mauvaise insn pour frv_print_operand_memory_reference:" #: config/frv/frv.c:2688 #, c-format @@ -3406,92 +3374,64 @@ msgid "bad condition code" msgstr "mauvais code de condition" #: config/frv/frv.c:2762 -#, fuzzy -#| msgid "Bad insn in frv_print_operand, bad const_double" msgid "bad insn in frv_print_operand, bad const_double" -msgstr "insn erroné dans frv_print_operand, bad const_double" +msgstr "mauvaise insn dans frv_print_operand, mauvais const_double" #: config/frv/frv.c:2823 -#, fuzzy -#| msgid "Bad insn to frv_print_operand, 'e' modifier:" msgid "bad insn to frv_print_operand, 'e' modifier:" -msgstr "insn erroné pour frv_print_operand, modificateur « e »:" +msgstr "mauvaise insn pour frv_print_operand, modificateur « e »:" #: config/frv/frv.c:2831 -#, fuzzy -#| msgid "Bad insn to frv_print_operand, 'F' modifier:" msgid "bad insn to frv_print_operand, 'F' modifier:" -msgstr "insn erroné pour frv_print_operand, modificateur « F »:" +msgstr "mauvaise insn pour frv_print_operand, modificateur « F »:" #: config/frv/frv.c:2847 -#, fuzzy -#| msgid "Bad insn to frv_print_operand, 'f' modifier:" msgid "bad insn to frv_print_operand, 'f' modifier:" -msgstr "insn erroné pour frv_print_operand, modificateur « f »:" +msgstr "mauvaise insn pour frv_print_operand, modificateur « f »:" #: config/frv/frv.c:2861 -#, fuzzy -#| msgid "Bad insn to frv_print_operand, 'C' modifier:" msgid "bad insn to frv_print_operand, 'g' modifier:" -msgstr "insn erroné pour frv_print_operand, modificateur « C »:" +msgstr "mauvaise insn pour frv_print_operand, modificateur « g »:" #: config/frv/frv.c:2909 -#, fuzzy -#| msgid "Bad insn to frv_print_operand, 'L' modifier:" msgid "bad insn to frv_print_operand, 'L' modifier:" -msgstr "insn erroné pour frv_print_operand, modificateur « L »:" +msgstr "mauvaise insn pour frv_print_operand, modificateur « L »:" #: config/frv/frv.c:2922 -#, fuzzy -#| msgid "Bad insn to frv_print_operand, 'M/N' modifier:" msgid "bad insn to frv_print_operand, 'M/N' modifier:" -msgstr "insn erroné pour frv_print_operand, modificateur « M/N »:" +msgstr "mauvaise insn pour frv_print_operand, modificateur « M/N »:" #: config/frv/frv.c:2943 -#, fuzzy -#| msgid "Bad insn to frv_print_operand, 'O' modifier:" msgid "bad insn to frv_print_operand, 'O' modifier:" -msgstr "insn erroné pour frv_print_operand, modificateur « O »:" +msgstr "mauvaise insn pour frv_print_operand, modificateur « O »:" #: config/frv/frv.c:2961 -#, fuzzy -#| msgid "Bad insn to frv_print_operand, P modifier:" msgid "bad insn to frv_print_operand, P modifier:" -msgstr "insn erroné pour frv_print_operand, modificateur « P »:" +msgstr "mauvaise insn pour frv_print_operand, modificateur « P »:" #: config/frv/frv.c:2981 -#, fuzzy -#| msgid "Bad insn in frv_print_operand, z case" msgid "bad insn in frv_print_operand, z case" -msgstr "insn erroné dans frv_print_operand, case z" +msgstr "mauvaise insn dans frv_print_operand, case z" #: config/frv/frv.c:3012 -#, fuzzy -#| msgid "Bad insn in frv_print_operand, 0 case" msgid "bad insn in frv_print_operand, 0 case" -msgstr "insn erroné dans frv_print_operand, case 0" +msgstr "mauvaise insn dans frv_print_operand, case 0" #: config/frv/frv.c:3017 msgid "frv_print_operand: unknown code" msgstr "frv_print_operand: code inconnu" #: config/frv/frv.c:4421 -#, fuzzy -#| msgid "Bad output_move_single operand" msgid "bad output_move_single operand" -msgstr "opérande output_move_single erronée" +msgstr "mauvais opérande output_move_single" #: config/frv/frv.c:4548 -#, fuzzy -#| msgid "Bad output_move_double operand" msgid "bad output_move_double operand" -msgstr "opérande output_move_double erronée" +msgstr "mauvais opérande output_move_double" #: config/frv/frv.c:4690 -#, fuzzy -#| msgid "Bad output_condmove_single operand" msgid "bad output_condmove_single operand" -msgstr "opérande output_condmove_single erronée" +msgstr "mauvais opérande output_condmove_single" #: config/i386/i386.c:16060 #, c-format @@ -3516,7 +3456,7 @@ msgstr "type opérande invalide utilisé avec le code d'opérande « Z »" #: config/i386/i386.c:16874 #, c-format msgid "invalid operand size for operand code 'Z'" -msgstr "taille opérande invalide pour le code d'opérande « Z »" +msgstr "taille d'opérande invalide pour le code d'opérande « Z »" #: config/i386/i386.c:16950 #, c-format @@ -3534,10 +3474,9 @@ msgid "operand is not a condition code, invalid operand code '%c'" msgstr "l'opérande n'est pas du code de condition, code d'opérande invalide « %c »" #: config/i386/i386.c:17053 -#, fuzzy, c-format -#| msgid "operand is neither a constant nor a condition code, invalid operand code 'c'" +#, c-format msgid "operand is not an offsettable memory reference, invalid operand code 'H'" -msgstr "l'opérande n'est ni une constante ni du code de condition, code d'opérande invalide « c »" +msgstr "l'opérande n'est pas une référence mémoire avec décalage, code d'opérande invalide « H »" #: config/i386/i386.c:17218 #, c-format @@ -3554,10 +3493,9 @@ msgid "unknown insn mode" msgstr "mode insn inconnu" #: config/i386/djgpp.h:146 -#, fuzzy, c-format -#| msgid "-f%s not supported: ignored" +#, c-format msgid "-f%s ignored (not supported for DJGPP)\n" -msgstr "-f%s n'est pas supporté: ignoré" +msgstr "-f%s ignoré (pas supporté par DJGPP)\n" #: config/i386/i386-interix.h:77 msgid "Use native (MS) bitfield layout" @@ -3608,15 +3546,15 @@ msgstr "utilisation invalide de %%d, %%x, ou %%X" #: config/lm32/lm32.c:507 #, c-format msgid "only 0.0 can be loaded as an immediate" -msgstr "" +msgstr "seul 0.0 peut être chargé comme valeur immédiate" #: config/lm32/lm32.c:577 msgid "bad operand" -msgstr "opérande erronée" +msgstr "mauvais opérande" #: config/lm32/lm32.c:589 msgid "can't use non gp relative absolute address" -msgstr "" +msgstr "impossible d'utiliser une adresse absolue pas relative à gp" #: config/lm32/lm32.c:593 msgid "invalid addressing mode" @@ -3634,7 +3572,7 @@ msgstr "opérande invalide pour le code %%p" #: config/m32r/m32r.c:2137 msgid "bad insn for 'A'" -msgstr "insn erroné pour « A »" +msgstr "mauvaise insn pour « A »" #: config/m32r/m32r.c:2184 #, c-format @@ -3648,20 +3586,20 @@ msgstr "opérande invalide pour le code %%N" #: config/m32r/m32r.c:2240 msgid "pre-increment address is not a register" -msgstr "pré-incrément d'adresse n'est pas un registre" +msgstr "le pré-incrément d'adresse n'est pas un registre" #: config/m32r/m32r.c:2247 msgid "pre-decrement address is not a register" -msgstr "pré-décrément d'adresse n'est pas un registre" +msgstr "le pré-décrément d'adresse n'est pas un registre" #: config/m32r/m32r.c:2254 msgid "post-increment address is not a register" -msgstr "post-incrément d'adresse n'est pas un registre" +msgstr "le post-incrément d'adresse n'est pas un registre" #: config/m32r/m32r.c:2328 config/m32r/m32r.c:2343 #: config/rs6000/rs6000.c:32640 msgid "bad address" -msgstr "adresse erronée" +msgstr "mauvaise adresse" #: config/m32r/m32r.c:2348 msgid "lo_sum not of register" @@ -3685,12 +3623,12 @@ msgstr "pointeur null" #: config/microblaze/microblaze.c:2234 #, c-format msgid "PRINT_OPERAND, invalid insn for %%C" -msgstr "PRINT_OPERAND insn invalide pour %%C" +msgstr "PRINT_OPERAND, insn invalide pour %%C" #: config/microblaze/microblaze.c:2263 #, c-format msgid "PRINT_OPERAND, invalid insn for %%N" -msgstr "PRINT_OPERAND insn invalide pour %%N" +msgstr "PRINT_OPERAND, insn invalide pour %%N" #: config/microblaze/microblaze.c:2283 config/microblaze/microblaze.c:2458 msgid "insn contains an invalid address !" @@ -3720,44 +3658,44 @@ msgstr "utilisation invalide de « %%%c »" #: config/mmix/mmix.c:1547 config/mmix/mmix.c:1677 msgid "MMIX Internal: Expected a CONST_INT, not this" -msgstr "MMIX interne: attendait CONST_INT, pas ceci" +msgstr "MMIX interne: Attendait CONST_INT, pas ceci" #: config/mmix/mmix.c:1626 msgid "MMIX Internal: Bad value for 'm', not a CONST_INT" -msgstr "MMIX interne: valeur erronée pour « m », pas un CONST_INT" +msgstr "MMIX interne: Valeur erronée pour « m », pas un CONST_INT" #: config/mmix/mmix.c:1645 msgid "MMIX Internal: Expected a register, not this" -msgstr "MMIX interne: attendait un registre, pas ceci" +msgstr "MMIX interne: Attendait un registre, pas ceci" #: config/mmix/mmix.c:1655 msgid "MMIX Internal: Expected a constant, not this" -msgstr "MMIX interne: attendait une constante, pas ceci" +msgstr "MMIX interne: Attendait une constante, pas ceci" #. We need the original here. #: config/mmix/mmix.c:1739 msgid "MMIX Internal: Cannot decode this operand" -msgstr "MMIX interne: ne peut décoder cette opérande" +msgstr "MMIX interne: Ne peut décoder cet opérande" #: config/mmix/mmix.c:1795 msgid "MMIX Internal: This is not a recognized address" -msgstr "MMIX interne: ce n'est pas une adresse reconnue" +msgstr "MMIX interne: Ce n'est pas une adresse reconnue" #: config/mmix/mmix.c:2671 msgid "MMIX Internal: Trying to output invalidly reversed condition:" -msgstr "MMIX interne: tentative de produire incorrectement une condition renversée:" +msgstr "MMIX interne: Tentative de produire une condition incorrectement inversée:" #: config/mmix/mmix.c:2678 msgid "MMIX Internal: What's the CC of this?" -msgstr "MMIX interne: quel sorte de CC est-ce?" +msgstr "MMIX interne: Quel est le CC de ceci ?" #: config/mmix/mmix.c:2682 msgid "MMIX Internal: What is the CC of this?" -msgstr "MMIX interne: quel sorte de CC est-ce?" +msgstr "MMIX interne: Quel est le CC de ceci ?" #: config/mmix/mmix.c:2724 msgid "MMIX Internal: This is not a constant:" -msgstr "interne MMIX: ce n'est pas une constante:" +msgstr "MMIX interne: Ce n'est pas une constante:" #: config/msp430/msp430.c:3609 #, c-format @@ -3767,12 +3705,12 @@ msgstr "préfixe d'opérande invalide" #: config/msp430/msp430.c:3643 #, c-format msgid "invalid zero extract" -msgstr "" +msgstr "« zero extract » invalide" #: config/rl78/rl78.c:1797 config/rl78/rl78.c:1883 #, c-format msgid "q/Q modifiers invalid for symbol references" -msgstr "" +msgstr "modificateurs q/Q invalides pour des références de symboles" #: config/rs6000/host-darwin.c:94 #, c-format @@ -3782,11 +3720,11 @@ msgstr "Manque d'espace sur la pile.\n" #: config/rs6000/host-darwin.c:115 #, c-format msgid "Try running '%s' in the shell to raise its limit.\n" -msgstr "Essayer d'exécuter « %s » dans le shell pour augmenter la limite.\n" +msgstr "Essayez d'exécuter « %s » dans le shell pour augmenter sa limite.\n" #: config/rs6000/rs6000.c:3959 msgid "-maltivec=le not allowed for big-endian targets" -msgstr "" +msgstr "-maltivec=le pas permis pour des cibles gros-boutistes" #: config/rs6000/rs6000.c:3971 msgid "-mvsx requires hardware floating point" @@ -3794,7 +3732,7 @@ msgstr "-mvsx nécessite une unité matérielle en virgule flottante" #: config/rs6000/rs6000.c:3979 msgid "-mvsx and -mpaired are incompatible" -msgstr "-mvsx et -mpaired -msystem-v sont incompatibles" +msgstr "-mvsx et -mpaired sont incompatibles" #: config/rs6000/rs6000.c:3981 msgid "-mvsx needs indexed addressing" @@ -3814,29 +3752,27 @@ msgstr "-mquad-memory requiert le mode 64 bits" #: config/rs6000/rs6000.c:4132 msgid "-mquad-memory-atomic requires 64-bit mode" -msgstr "" +msgstr "-mquad-memory-atomic requiert le mode 64 bits" #: config/rs6000/rs6000.c:4144 msgid "-mquad-memory is not available in little endian mode" -msgstr "" +msgstr "-mquad-memory n'est pas disponible en mode petit-boutiste" #: config/rs6000/rs6000.c:4212 -#, fuzzy -#| msgid "-mquad-memory requires 64-bit mode" msgid "-mtoc-fusion requires 64-bit" -msgstr "-mquad-memory requiert le mode 64 bits" +msgstr "-mtoc-fusion requiert le mode 64 bits" #: config/rs6000/rs6000.c:4219 msgid "-mtoc-fusion requires medium/large code model" -msgstr "" +msgstr "-mtoc-fusion requiert le modèle de code moyen/large" #: config/rs6000/rs6000.c:9919 msgid "bad move" -msgstr "mauvais mouvement" +msgstr "mauvais déplacement" #: config/rs6000/rs6000.c:20411 msgid "Bad 128-bit move" -msgstr "" +msgstr "Mauvais déplacement sur 128 bits" #: config/rs6000/rs6000.c:20602 #, c-format @@ -3909,78 +3845,71 @@ msgid "invalid %%x value" msgstr "valeur %%x invalide" #: config/rs6000/rs6000.c:21099 -#, fuzzy, c-format -#| msgid "invalid punctuation `%c' in constraint" +#, c-format msgid "invalid %%y value, try using the 'Z' constraint" -msgstr "ponctuation invalide « %c » dans la contrainte" +msgstr "valeur %%y invalide, essayez d'utiliser la contrainte « Z »" #: config/rs6000/rs6000.c:21814 msgid "__float128 and __ibm128 cannot be used in the same expression" -msgstr "" +msgstr "__float128 et __ibm128 ne peuvent pas être utilisés dans la même expression" #: config/rs6000/rs6000.c:21820 msgid "__ibm128 and long double cannot be used in the same expression" -msgstr "" +msgstr "__ibm128 et long double ne peuvent pas être utilisés dans la même expression" #: config/rs6000/rs6000.c:21826 msgid "__float128 and long double cannot be used in the same expression" -msgstr "" +msgstr "__float128 et long double ne peuvent pas être utilisés dans la même expression" #: config/rs6000/rs6000.c:35706 -#, fuzzy -#| msgid "too few arguments to function" msgid "AltiVec argument passed to unprototyped function" -msgstr "trop peu d'arguments pour la fonction" +msgstr "Argument AltiVec passé à une fonction sans prototype" #: config/rs6000/rs6000.c:37429 -#, fuzzy -#| msgid "Do not generate code for a Sun FPA" msgid "Could not generate addis value for fusion" -msgstr "Ne pas générer le code pour un Sun FPA" +msgstr "N'a pu générer de valeur addis pour la fusion" #: config/rs6000/rs6000.c:37501 -#, fuzzy -#| msgid "unable to generate reloads for:" msgid "Unable to generate load/store offset for fusion" -msgstr "incapable de générer des recharges pour:" +msgstr "Impossible de générer un offset load/store pour la fusion" #: config/rs6000/rs6000.c:37605 msgid "Bad GPR fusion" -msgstr "" +msgstr "Mauvaise fusion GPR" #: config/rs6000/rs6000.c:37823 msgid "emit_fusion_p9_load, bad reg #1" -msgstr "" +msgstr "emit_fusion_p9_load, mauvais reg #1" #: config/rs6000/rs6000.c:37860 msgid "emit_fusion_p9_load, bad reg #2" -msgstr "" +msgstr "emit_fusion_p9_load, mauvais reg #2" #: config/rs6000/rs6000.c:37863 msgid "emit_fusion_p9_load not MEM" -msgstr "" +msgstr "emit_fusion_p9_load pas MEM" #: config/rs6000/rs6000.c:37901 msgid "emit_fusion_p9_store, bad reg #1" -msgstr "" +msgstr "emit_fusion_p9_store, mauvais reg #1" #: config/rs6000/rs6000.c:37938 msgid "emit_fusion_p9_store, bad reg #2" -msgstr "" +msgstr "emit_fusion_p9_store, mauvais reg #2" #: config/rs6000/rs6000.c:37941 msgid "emit_fusion_p9_store not MEM" -msgstr "" +msgstr "emit_fusion_p9_store pas MEM" #: config/s390/s390.c:7168 #, c-format msgid "symbolic memory references are only supported on z10 or later" -msgstr "" +msgstr "les références mémoire symboliques sont uniquement supportées sur z10 ou ultérieur" #: config/s390/s390.c:7179 #, c-format msgid "cannot decompose address" -msgstr "Adresse indécomposable" +msgstr "adresse indécomposable" #: config/s390/s390.c:7248 #, c-format @@ -4005,7 +3934,7 @@ msgstr "adresse invalide pour le modificateur de sortie « R »" #: config/s390/s390.c:7329 #, c-format msgid "memory reference expected for 'S' output modifier" -msgstr "" +msgstr "référence mémoire attendue pour le modificateur de sortie « S »" #: config/s390/s390.c:7339 #, c-format @@ -4015,12 +3944,12 @@ msgstr "adresse invalide pour le modificateur de sortie « S »" #: config/s390/s390.c:7360 #, c-format msgid "register or memory expression expected for 'N' output modifier" -msgstr "" +msgstr "registre ou expression mémoire attendue pour le modificateur de sortie « N »" #: config/s390/s390.c:7371 #, c-format msgid "register or memory expression expected for 'M' output modifier" -msgstr "" +msgstr "registre ou expression mémoire attendue pour le modificateur de sortie « M »" #: config/s390/s390.c:7456 config/s390/s390.c:7477 #, c-format @@ -4030,13 +3959,12 @@ msgstr "constante invalide pour le modificateur de sortie « %c »" #: config/s390/s390.c:7474 #, c-format msgid "invalid constant - try using an output modifier" -msgstr "opérande invalide - essayez un modificateur de sortie" +msgstr "constante invalide - essayez un modificateur de sortie" #: config/s390/s390.c:7515 -#, fuzzy, c-format -#| msgid "invalid constant for output modifier '%c'" +#, c-format msgid "invalid constant vector for output modifier '%c'" -msgstr "constante invalide pour le modificateur de sortie « %c »" +msgstr "vecteur constant invalide pour le modificateur de sortie « %c »" #: config/s390/s390.c:7522 #, c-format @@ -4049,30 +3977,24 @@ msgid "invalid expression for output modifier '%c'" msgstr "expression invalide pour le modificateur de sortie « %c »" #: config/s390/s390.c:11377 -#, fuzzy -#| msgid "too few arguments to function" msgid "Vector argument passed to unprototyped function" -msgstr "trop peu d'arguments pour la fonction" +msgstr "Vecteur passé en argument à une fonction sans prototype" #: config/s390/s390.c:15036 -#, fuzzy -#| msgid "pointer targets in %s differ in signedness" msgid "types differ in signess" -msgstr "les cibles des pointeurs dans %s n'ont pas toutes de signe" +msgstr "les types diffèrent sur le type signé/non-signé" #: config/s390/s390.c:15046 msgid "binary operator does not support two vector bool operands" -msgstr "" +msgstr "l'opérateur binaire ne supporte pas deux opérandes booléens vectoriels" #: config/s390/s390.c:15049 -#, fuzzy -#| msgid "target format does not support infinity" msgid "binary operator does not support vector bool operand" -msgstr "le format cible ne supporte pas l'infini" +msgstr "l'opérateur binaire ne supporte pas l'opérande booléen vectoriel" #: config/s390/s390.c:15057 msgid "binary operator does not support mixing vector bool with floating point vector operands" -msgstr "" +msgstr "l'opérateur binaire ne supporte pas le mélange d'un booléen vectoriel avec un vecteur en virgule flottante" #: config/sh/sh.c:1313 #, c-format @@ -4109,7 +4031,7 @@ msgstr "opérande %%A invalide" #: config/sparc/sparc.c:8844 #, c-format msgid "invalid %%B operand" -msgstr "Opérande %%B invalide" +msgstr "opérande %%B invalide" #: config/sparc/sparc.c:8873 config/tilegx/tilegx.c:5095 #: config/tilepro/tilepro.c:4510 @@ -4133,25 +4055,24 @@ msgid "invalid %%s operand" msgstr "opérande %%s invalide" #: config/sparc/sparc.c:8963 -#, fuzzy, c-format -#| msgid "floating point constant not a valid immediate operand" +#, c-format msgid "floating-point constant not a valid immediate operand" -msgstr "constante en virgule flottante n'est pas une opérande immédiate valide" +msgstr "la constante en virgule flottante n'est pas un opérande immédiat valide" #: config/stormy16/stormy16.c:1733 config/stormy16/stormy16.c:1804 #, c-format msgid "'B' operand is not constant" -msgstr "opérande « B » n'est pas une constante" +msgstr "l'opérande « B » n'est pas une constante" #: config/stormy16/stormy16.c:1760 #, c-format msgid "'B' operand has multiple bits set" -msgstr "l'opérande « B » a de multiples jeux de bits" +msgstr "l'opérande « B » a plusieurs bits activés" #: config/stormy16/stormy16.c:1786 #, c-format msgid "'o' operand is not constant" -msgstr "opérande « o » n'est pas une constante" +msgstr "l'opérande « o » n'est pas une constante" #: config/stormy16/stormy16.c:1818 #, c-format @@ -4206,12 +4127,12 @@ msgstr "opérande %%N invalide" #: config/tilegx/tilegx.c:5385 #, c-format msgid "invalid operand for 'r' specifier" -msgstr "opérande invalide pour le modificateur « r »" +msgstr "opérande invalide pour le spécificateur « r »" #: config/tilegx/tilegx.c:5409 config/tilepro/tilepro.c:4816 #, c-format msgid "unable to print out operand yet; code == %d (%c)" -msgstr "" +msgstr "impossible d'imprimer l'opérande pour l'instant; code == %d (%c)" #: config/tilepro/tilepro.c:4560 #, c-format @@ -4245,7 +4166,7 @@ msgstr "opérande %%r invalide" #: config/v850/v850.c:293 msgid "const_double_split got a bad insn:" -msgstr "const_double_split a reçu un insn erroné :" +msgstr "const_double_split a reçu une mauvaise insn :" #: config/v850/v850.c:899 msgid "output_move_single:" @@ -4254,27 +4175,25 @@ msgstr "output_move_single :" #: config/vax/vax.c:453 #, c-format msgid "symbol used with both base and indexed registers" -msgstr "" +msgstr "symbole utilisé conjointement avec des registres de base et d'index" #: config/vax/vax.c:462 -#, fuzzy, c-format -#| msgid "code model %s not supported in PIC mode" +#, c-format msgid "symbol with offset used in PIC mode" -msgstr "model de code %s n'est pas supporté en mode PIC" +msgstr "symbole avec offset utilisé en mode PIC" #: config/vax/vax.c:550 -#, fuzzy, c-format -#| msgid "long long constant not a valid immediate operand" +#, c-format msgid "symbol used as immediate operand" -msgstr "constante long long n'est pas une opérande immédiate valide" +msgstr "symbole utilisé comme opérande immédiat" #: config/vax/vax.c:1577 msgid "illegal operand detected" -msgstr "opérande illégale détectée" +msgstr "opérande illégal détecté" #: config/visium/visium.c:3255 msgid "illegal operand " -msgstr "opérande illégale " +msgstr "opérande illégal " #: config/visium/visium.c:3306 msgid "illegal operand address (1)" @@ -4295,7 +4214,7 @@ msgstr "adresse d'opérande illégale (4)" #: config/xtensa/xtensa.c:768 config/xtensa/xtensa.c:800 #: config/xtensa/xtensa.c:809 msgid "bad test" -msgstr "test erroné" +msgstr "mauvais test" #: config/xtensa/xtensa.c:2301 #, c-format @@ -4322,7 +4241,7 @@ msgstr "pas de registre dans l'adresse" #: config/xtensa/xtensa.c:2487 msgid "address offset not a constant" -msgstr "décalage d'adresse n'est pas une constante" +msgstr "le décalage d'adresse n'est pas une constante" #: c/c-objc-common.c:160 msgid "aka" @@ -4406,7 +4325,7 @@ msgstr "%<]%> attendu" #: c/c-parser.c:3759 msgid "expected %<;%>, %<,%> or %<)%>" -msgstr "%<;%>, %<,%> or %<)%> attendu" +msgstr "%<;%>, %<,%> ou %<)%> attendu" #: c/c-parser.c:4372 c/c-parser.c:14517 cp/parser.c:26967 cp/parser.c:28889 #, gcc-internal-format @@ -4430,19 +4349,19 @@ msgstr "%<:%> attendu" #: c/c-parser.c:5185 cp/semantics.c:613 msgid "Cilk array notation cannot be used as a computed goto expression" -msgstr "" +msgstr "La notation Cilk d'un tableau ne peut pas être utilisée comme expression d'un goto calculé" #: c/c-parser.c:5244 msgid "Cilk array notation cannot be used for a throw expression" -msgstr "" +msgstr "La notation Cilk d'un tableau ne peut pas être utilisée comme expression pour déclencher une exception" #: c/c-parser.c:5556 cp/semantics.c:1136 msgid "Cilk array notation cannot be used as a condition for switch statement" -msgstr "" +msgstr "La notation Cilk d'un tableau ne peut pas être utilisée comme condition d'un switch" #: c/c-parser.c:5605 cp/semantics.c:791 msgid "Cilk array notation cannot be used as a condition for while statement" -msgstr "" +msgstr "La notation Cilk d'un tableau ne peut pas être utilisée comme condition d'un while" #: c/c-parser.c:5656 cp/parser.c:26897 #, gcc-internal-format @@ -4451,11 +4370,11 @@ msgstr "% attendu" #: c/c-parser.c:5663 cp/semantics.c:850 msgid "Cilk array notation cannot be used as a condition for a do-while statement" -msgstr "" +msgstr "La notation Cilk d'un tableau ne peut pas être utilisée comme condition d'un do-while" #: c/c-parser.c:5866 cp/semantics.c:969 msgid "Cilk array notation cannot be used in a condition for a for-loop" -msgstr "" +msgstr "La notation Cilk d'un tableau ne peut pas être utilisée comme condition d'une boucle for" #: c/c-parser.c:7497 msgid "expected %<.%>" @@ -4475,7 +4394,7 @@ msgstr "%<>%> attendu" #: c/c-parser.c:12116 c/c-parser.c:12880 cp/parser.c:27012 #, gcc-internal-format msgid "expected %<,%> or %<)%>" -msgstr "%<,%> or %<)%> attendu" +msgstr "%<,%> ou %<)%> attendu" #: c/c-parser.c:14229 c/c-parser.c:14273 c/c-parser.c:14501 c/c-parser.c:14736 #: c/c-parser.c:16891 c/c-parser.c:17513 c/c-parser.c:4573 cp/parser.c:26991 @@ -4486,7 +4405,7 @@ msgstr "%<=%> attendu" #: c/c-parser.c:15280 c/c-parser.c:15270 cp/parser.c:34132 #, gcc-internal-format msgid "expected %<#pragma omp section%> or %<}%>" -msgstr "%<#pragma omp section%> or %<}%> attendu" +msgstr "%<#pragma omp section%> ou %<}%> attendu" #: c/c-parser.c:17675 c/c-parser.c:10602 cp/parser.c:26976 cp/parser.c:30031 #, gcc-internal-format @@ -4507,23 +4426,23 @@ msgstr "candidat 2 :" #: cp/decl2.c:778 msgid "candidates are: %+#D" -msgstr "candidats sont : %+#D" +msgstr "les candidats sont : %+#D" #: cp/decl2.c:780 msgid "candidate is: %+#D" -msgstr "candidat est : %+#D" +msgstr "le candidat est : %+#D" #: cp/error.c:317 msgid "" -msgstr "" +msgstr "" #: cp/error.c:417 msgid "" -msgstr "" +msgstr "" #: cp/error.c:419 msgid "" -msgstr "" +msgstr "" #: cp/error.c:581 msgid "" @@ -4532,7 +4451,7 @@ msgstr "" #: cp/error.c:684 #, c-format msgid "" -msgstr "" +msgstr "<%s anonyme>" #. A lambda's "type" is essentially its signature. #: cp/error.c:689 @@ -4541,29 +4460,29 @@ msgstr "" -msgstr "" +msgstr "" #: cp/error.c:948 #, c-format msgid "(static initializers for %s)" -msgstr "(initialiseur static pour « %s »)" +msgstr "(initialiseurs statiques pour %s)" #: cp/error.c:950 #, c-format msgid "(static destructors for %s)" -msgstr "(destructeur static pour %s)" +msgstr "(destructeurs statiques pour %s)" #: cp/error.c:1063 msgid "vtable for " -msgstr "" +msgstr "vtable pour " #: cp/error.c:1087 msgid " " -msgstr "" +msgstr " " #: cp/error.c:1102 msgid "{anonymous}" -msgstr "{anonymous}" +msgstr "{anonyme}" #: cp/error.c:1104 msgid "(anonymous namespace)" @@ -4571,7 +4490,7 @@ msgstr "(espace de nom anonyme)" #: cp/error.c:1220 msgid "