aboutsummaryrefslogtreecommitdiff
path: root/libgomp
diff options
context:
space:
mode:
authorMartin Jambor <mjambor@suse.cz>2016-11-23 15:51:02 +0100
committerMartin Jambor <jamborm@gcc.gnu.org>2016-11-23 15:51:02 +0100
commit56b1c60e412fcf1245b4780871553cbdebb956a3 (patch)
tree3a3e101ec1a0e1bdd140db82245f5884d841c62f /libgomp
parentf6cdfe826444e1a0b52b271588fbef5c2a4bac4d (diff)
downloadgcc-56b1c60e412fcf1245b4780871553cbdebb956a3.zip
gcc-56b1c60e412fcf1245b4780871553cbdebb956a3.tar.gz
gcc-56b1c60e412fcf1245b4780871553cbdebb956a3.tar.bz2
backport: hsa-builtins.def: New file.
Merge from HSA branch to trunk 2016-11-23 Martin Jambor <mjambor@suse.cz> Martin Liska <mliska@suse.cz> gcc/ * hsa-builtins.def: New file. * Makefile.in (BUILTINS_DEF): Add hsa-builtins.def dependency. * builtins.def: Include hsa-builtins.def. (DEF_HSA_BUILTIN): New macro. * dumpfile.h (OPTGROUP_OPENMP): Define. * dumpfile.c (optgroup_options): Added OPTGROUP_OPENMP. * gimple.h (gf_mask): Added elements GF_OMP_FOR_GRID_INTRA_GROUP and GF_OMP_FOR_GRID_GROUP_ITER. (gimple_omp_for_grid_phony): Added checking assert. (gimple_omp_for_set_grid_phony): Likewise. (gimple_omp_for_grid_intra_group): New function. (gimple_omp_for_set_grid_intra_group): Likewise. (gimple_omp_for_grid_group_iter): Likewise. (gimple_omp_for_set_grid_group_iter): Likewise. * omp-low.c (check_omp_nesting_restrictions): Allow GRID loop where previosuly only distribute loop was permitted. (lower_lastprivate_clauses): Allow non tcc_comparison predicates. (grid_get_kernel_launch_attributes): Support multiple HSA grid dimensions. (grid_expand_omp_for_loop): Likewise and also support standalone distribute constructs. New parameter INTRA_GROUP, updated both users. (grid_expand_target_grid_body): Support standalone distribute constructs. (pass_data_expand_omp): Changed optinfo_flags to OPTGROUP_OPENMP. (pass_data_expand_omp_ssa): Likewise. (pass_data_omp_device_lower): Likewsie. (pass_data_lower_omp): Likewise. (pass_data_diagnose_omp_blocks): Likewise. (pass_data_oacc_device_lower): Likewise. (pass_data_omp_target_link): Likewise. (grid_lastprivate_predicate): New function. (lower_omp_for_lastprivate): Call grid_lastprivate_predicate for gridified loops. (lower_omp_for): Support standalone distribute constructs. (grid_prop): New type. (grid_safe_assignment_p): Check for assignments to group_sizes, new parameter GRID. (grid_seq_only_contains_local_assignments): New parameter GRID, pass it to callee. (grid_find_single_omp_among_assignments_1): Likewise, improve missed optimization info messages. (grid_find_single_omp_among_assignments): Likewise. (grid_find_ungridifiable_statement): Do not bail out for SIMDs. (grid_parallel_clauses_gridifiable): New function. (grid_inner_loop_gridifiable_p): Likewise. (grid_dist_follows_simple_pattern): Likewise. (grid_gfor_follows_tiling_pattern): Likewise. (grid_call_permissible_in_distribute_p): Likewise. (grid_handle_call_in_distribute): Likewise. (grid_dist_follows_tiling_pattern): Likewise. (grid_target_follows_gridifiable_pattern): Support standalone distribute constructs. (grid_var_segment): New enum. (grid_mark_variable_segment): New function. (grid_copy_leading_local_assignments): Call grid_mark_variable_segment if a new argument says so. (grid_process_grid_body): New function. (grid_eliminate_combined_simd_part): Likewise. (grid_mark_tiling_loops): Likewise. (grid_mark_tiling_parallels_and_loops): Likewise. (grid_process_kernel_body_copy): Support standalone distribute constructs. (grid_attempt_target_gridification): New grid variable holding overall gridification state. Support standalone distribute constructs and collapse clauses. * doc/optinfo.texi (Optimization groups): Document OPTGROUP_OPENMP. * hsa.h (hsa_bb): Add method method append_phi. (hsa_insn_br): Renamed to hsa_insn_cbr, renamed all occurences in all files too. (hsa_insn_br): New class, now the ancestor of hsa_incn_cbr. (is_a_helper <hsa_insn_br *>::test): New function. (is_a_helper <hsa_insn_cbr *>::test): Adjust to only cover conditional branch instructions. (hsa_insn_signal): Make a direct descendant of hsa_insn_basic. Add memorder constructor parameter and m_memory_order and m_signalop member variables. (hsa_insn_queue): Changed constructor parameters to common form. Added m_segment and m_memory_order member variables. (hsa_summary_t): Add private member function process_gpu_implementation_attributes. (hsa_function_summary): Rename m_binded_function to m_bound_function. (hsa_insn_basic_p): Remove typedef. (hsa_op_with_type): Change hsa_insn_basic_p into plain pointers. (hsa_op_reg_p): Remove typedef. (hsa_function_representation): Change hsa_op_reg_p into plain pointers. (hsa_insn_phi): Removed new and delete operators. (hsa_insn_br): Likewise. (hsa_insn_cbr): Likewise. (hsa_insn_sbr): Likewise. (hsa_insn_cmp): Likewise. (hsa_insn_mem): Likewise. (hsa_insn_atomic): Likewise. (hsa_insn_signal): Likewise. (hsa_insn_seg): Likewise. (hsa_insn_call): Likewise. (hsa_insn_arg_block): Likewise. (hsa_insn_comment): Likewise. (hsa_insn_srctype): Likewise. (hsa_insn_packed): Likewise. (hsa_insn_cvt): Likewise. (hsa_insn_alloca): Likewise. * hsa.c (hsa_destroy_insn): Also handle instances of hsa_insn_br. (process_gpu_implementation_attributes): New function. (link_functions): Move some functionality into it. Adjust after renaming m_binded_functions to m_bound_functions. (hsa_insn_basic::op_output_p): Add BRIG_OPCODE_DEBUGTRAP to the list of instructions with no output registers. (get_in_type): Return this if it is a register of matching size. (hsa_get_declaration_name): Moved to... * hsa-gen.c (hsa_get_declaration_name): ...here. Allocate temporary string on an obstack instead from ggc. (query_hsa_grid): Renamed to query_hsa_grid_dim, reimplemented, cut down to two overloads. (hsa_allocp_operand_address): Removed. (hsa_allocp_operand_immed): Likewise. (hsa_allocp_operand_reg): Likewise. (hsa_allocp_operand_code_list): Likewise. (hsa_allocp_operand_operand_list): Likewise. (hsa_allocp_inst_basic): Likewise. (hsa_allocp_inst_phi): Likewise. (hsa_allocp_inst_mem): Likewise. (hsa_allocp_inst_atomic): Likewise. (hsa_allocp_inst_signal): Likewise. (hsa_allocp_inst_seg): Likewise. (hsa_allocp_inst_cmp): Likewise. (hsa_allocp_inst_br): Likewise. (hsa_allocp_inst_sbr): Likewise. (hsa_allocp_inst_call): Likewise. (hsa_allocp_inst_arg_block): Likewise. (hsa_allocp_inst_comment): Likewise. (hsa_allocp_inst_queue): Likewise. (hsa_allocp_inst_srctype): Likewise. (hsa_allocp_inst_packed): Likewise. (hsa_allocp_inst_cvt): Likewise. (hsa_allocp_inst_alloca): Likewise. (hsa_allocp_bb): Likewise. (hsa_obstack): New. (hsa_init_data_for_cfun): Initialize obstack. (hsa_deinit_data_for_cfun): Release memory of the obstack. (hsa_op_immed::operator new): Use obstack instead of object_allocator. (hsa_op_reg::operator new): Likewise. (hsa_op_address::operator new): Likewise. (hsa_op_code_list::operator new): Likewise. (hsa_op_operand_list::operator new): Likewise. (hsa_insn_basic::operator new): Likewise. (hsa_insn_phi::operator new): Likewise. (hsa_insn_br::operator new): Likewise. (hsa_insn_sbr::operator new): Likewise. (hsa_insn_cmp::operator new): Likewise. (hsa_insn_mem::operator new): Likewise. (hsa_insn_atomic::operator new): Likewise. (hsa_insn_signal::operator new): Likewise. (hsa_insn_seg::operator new): Likewise. (hsa_insn_call::operator new): Likewise. (hsa_insn_arg_block::operator new): Likewise. (hsa_insn_comment::operator new): Likewise. (hsa_insn_srctype::operator new): Likewise. (hsa_insn_packed::operator new): Likewise. (hsa_insn_cvt::operator new): Likewise. (hsa_insn_alloca::operator new): Likewise. (hsa_init_new_bb): Likewise. (hsa_bb::append_phi): New function. (gen_hsa_phi_from_gimple_phi): Use it. (get_symbol_for_decl): Fix dinstinguishing between global and local functions. Put local variables into a segment according to their attribute or static flag, if there is one. (hsa_insn_br::hsa_insn_br): New. (hsa_insn_br::operator new): Likewise. (hsa_insn_cbr::hsa_insn_cbr): Set width via ancestor constructor. (query_hsa_grid_nodim): New function. (multiply_grid_dim_characteristics): Likewise. (gen_get_num_threads): Likewise. (gen_get_num_teams): Reimplemented. (gen_get_team_num): Likewise. (gen_hsa_insns_for_known_library_call): Updated calls to the above helper functions. (get_memory_order_name): Removed. (get_memory_order): Likewise. (hsa_memorder_from_tree): New function. (gen_hsa_ternary_atomic_for_builtin): Renamed to gen_hsa_atomic_for_builtin, can also create signals. (gen_hsa_insns_for_call): Handle many new builtins. Adjust to use hsa_memory_order_from_tree and gen_hsa_atomic_for_builtin. (hsa_insn_atomic): Fix function comment. (hsa_insn_signal::hsa_insn_signal): Fix comment. Update call to ancestor constructor and initialization of new member variables. (hsa_insn_queue::hsa_insn_queue): Added initialization of new member variables. (hsa_get_host_function): Handle functions with no bound CPU implementation. Fix binded to bound. (get_brig_function_name): Likewise. (HSA_SORRY_ATV): Remove semicolon after macro. (HSA_SORRY_AT): Likewise. (omp_simple_builtin::generate): Add missing semicolons. (hsa_insn_phi::operator new): Removed. (hsa_insn_br::operator new): Likewise. (hsa_insn_cbr::operator new): Likewise. (hsa_insn_sbr::operator new): Likewise. (hsa_insn_cmp::operator new): Likewise. (hsa_insn_mem::operator new): Likewise. (hsa_insn_atomic::operator new): Likewise. (hsa_insn_signal::operator new): Likewise. (hsa_insn_seg::operator new): Likewise. (hsa_insn_call::operator new): Likewise. (hsa_insn_arg_block::operator new): Likewise. (hsa_insn_comment::operator new): Likewise. (hsa_insn_srctype::operator new): Likewise. (hsa_insn_packed::operator new): Likewise. (hsa_insn_cvt::operator new): Likewise. (hsa_insn_alloca::operator new): Likewise. (get_symbol_for_decl): Accept CONST_DECLs, put them to readonly segment. (gen_hsa_addr): Also process CONST_DECLs. (gen_hsa_addr_insns): Process CONST_DECLs by creating private copies. (gen_hsa_unary_operation): Make sure the function does not use bittype source type for firstbit and lastbit operations. (gen_hsa_popcount_to_dest): Make sure the function uses a bittype source type. * hsa-brig.c (emit_insn_operands): Cope with zero operands in an instruction. (emit_branch_insn): Renamed to emit_cond_branch_insn. Emit the width stored in the class. (emit_generic_branch_insn): New function. (emit_insn): Call emit_generic_branch_insn. (emit_signal_insn): Remove obsolete comment. Update member variable name, pick a type according to profile. (emit_alloca_insn): Remove obsolete comment. (emit_atomic_insn): Likewise. (emit_queue_insn): Get segment and memory order from the IR object. (hsa_brig_section): Make allocate_new_chunk, chunks and cur_chunk provate, add a default NULL parameter to add method. (hsa_brig_section::add): Added a new parameter, store pointer to output data there if it is non-NULL. (emit_function_directives): Use this new parameter instead of calculating the pointer itself, fix function comment. (hsa_brig_emit_function): Add forgotten endian conversion. (hsa_output_kernels): Remove unnecessary building of kernel_dependencies_vector_type. (emit_immediate_operand): Declare. (emit_directive_variable): Also emit initializers of CONST_DECLs. (gen_hsa_insn_for_internal_fn_call): Also handle IFN_RSQRT. (verify_function_arguments): Properly detect variadic arguments. * hsa-dump.c (hsa_width_specifier_name): New function. (dump_hsa_insn_1): Dump generic branch instructions, update signal member variable name. Special dumping for queue objects. * ipa-hsa.c (process_hsa_functions): Adjust after renaming m_binded_functions to m_bound_functions. Copy externally visible flag to the node. (ipa_hsa_write_summary): Likewise. (ipa_hsa_read_section): Likewise. gcc/fortran/ * f95-lang.c (DEF_HSA_BUILTIN): New macro. gcc/testsuite/ * c-c++-common/gomp/gridify-1.c: Update scan string. * gfortran.dg/gomp/gridify-1.f90: Likewise. * c-c++-common/gomp/gridify-2.c: New test. * c-c++-common/gomp/gridify-3.c: Likewise. libgomp/ * testsuite/libgomp.hsa.c/bits-insns.c: New test. * testsuite/libgomp.hsa.c/tiling-1.c: Likewise. * testsuite/libgomp.hsa.c/tiling-2.c: Likewise. Co-Authored-By: Martin Liska <mliska@suse.cz> From-SVN: r242761
Diffstat (limited to 'libgomp')
-rw-r--r--libgomp/ChangeLog6
-rw-r--r--libgomp/testsuite/libgomp.hsa.c/bits-insns.c73
-rw-r--r--libgomp/testsuite/libgomp.hsa.c/tiling-1.c212
-rw-r--r--libgomp/testsuite/libgomp.hsa.c/tiling-2.c258
4 files changed, 549 insertions, 0 deletions
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 19d8039..16781f9 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,9 @@
+2016-11-23 Martin Jambor <mjambor@suse.cz>
+
+ * testsuite/libgomp.hsa.c/bits-insns.c: New test.
+ * testsuite/libgomp.hsa.c/tiling-1.c: Likewise.
+ * testsuite/libgomp.hsa.c/tiling-2.c: Likewise.
+
2016-11-23 Martin Liska <mliska@suse.cz>
Martin Jambor <mjambor@suse.cz>
diff --git a/libgomp/testsuite/libgomp.hsa.c/bits-insns.c b/libgomp/testsuite/libgomp.hsa.c/bits-insns.c
new file mode 100644
index 0000000..21cac72
--- /dev/null
+++ b/libgomp/testsuite/libgomp.hsa.c/bits-insns.c
@@ -0,0 +1,73 @@
+#include <math.h>
+
+#define N 12
+
+int main()
+{
+ unsigned int arguments[N] = {0u, 1u, 2u, 3u, 111u, 333u, 444u, 0x80000000u, 0x0000ffffu, 0xf0000000u, 0xff000000u, 0xffffffffu};
+ int clrsb[N] = {};
+ int clz[N] = {};
+ int ctz[N] = {};
+ int ffs[N] = {};
+ int parity[N] = {};
+ int popcount[N] = {};
+
+ int ref_clrsb[N] = {};
+ int ref_clz[N] = {};
+ int ref_ctz[N] = {};
+ int ref_ffs[N] = {};
+ int ref_parity[N] = {};
+ int ref_popcount[N] = {};
+
+ for (unsigned i = 0; i < N; i++)
+ {
+ ref_clrsb[i] = __builtin_clrsb (arguments[i]);
+ ref_clz[i] = __builtin_clz (arguments[i]);
+ ref_ctz[i] = __builtin_ctz (arguments[i]);
+ ref_ffs[i] = __builtin_ffs (arguments[i]);
+ ref_parity[i] = __builtin_parity (arguments[i]);
+ ref_popcount[i] = __builtin_popcount (arguments[i]);
+ }
+
+ #pragma omp target map(from:clz, ctz, ffs, parity, popcount)
+ {
+ for (unsigned i = 0; i < N; i++)
+ {
+ clrsb[i] = __builtin_clrsb (arguments[i]);
+ clz[i] = __builtin_clz (arguments[i]);
+ ctz[i] = __builtin_ctz (arguments[i]);
+ ffs[i] = __builtin_ffs (arguments[i]);
+ parity[i] = __builtin_parity (arguments[i]);
+ popcount[i] = __builtin_popcount (arguments[i]);
+ }
+ }
+
+ for (unsigned i = 0; i < N; i++)
+ if (ref_clrsb[i] != clrsb[i])
+ __builtin_abort ();
+
+ /* CLZ of zero is undefined for zero. */
+ for (unsigned i = 1; i < N; i++)
+ if (ref_clz[i] != clz[i])
+ __builtin_abort ();
+
+ /* Likewise for ctz */
+ for (unsigned i = 1; i < N; i++)
+ if (ref_ctz[i] != ctz[i])
+ __builtin_abort ();
+
+ for (unsigned i = 0; i < N; i++)
+ if (ref_ffs[i] != ffs[i])
+ __builtin_abort ();
+
+ for (unsigned i = 0; i < N; i++)
+ if (ref_parity[i] != parity[i])
+ __builtin_abort ();
+
+ for (unsigned i = 0; i < N; i++)
+ if (ref_popcount[i] != popcount[i])
+ __builtin_abort ();
+
+ return 0;
+}
+
diff --git a/libgomp/testsuite/libgomp.hsa.c/tiling-1.c b/libgomp/testsuite/libgomp.hsa.c/tiling-1.c
new file mode 100644
index 0000000..9149adc
--- /dev/null
+++ b/libgomp/testsuite/libgomp.hsa.c/tiling-1.c
@@ -0,0 +1,212 @@
+/*
+
+ matmul.c : Matrix Multiplication with tiling for openmp4 example
+
+*/
+
+#include <stdlib.h>
+#include <math.h>
+
+#define BLOCK_SIZE 16
+/*
+ #define BLOCK_SIZE 32
+*/
+#define NSECPERSEC 1000000000L
+
+typedef struct {
+ int width;
+ int height;
+ int stride;
+ int hpad;
+ float* elements;
+} Matrix;
+
+/* Correctly extract the number of nanoseconds from the two time structures */
+long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
+ long int nanosecs;
+ if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
+ ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
+ ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
+ else nanosecs =
+ (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
+ ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
+ return nanosecs;
+}
+
+void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+
+int verify(float* v_res, float* v_ref, int len) {
+ int passed = 1;
+ int i;
+ for (i = 0; i < len; ++i) {
+ if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
+ __builtin_abort ();
+ }
+ }
+ return passed;
+}
+
+
+int main(int argc, char* argv[]){
+
+ Matrix A,B,Bt,C,Cref;
+ int a1,a2,a3,i,j;
+ struct timespec start_time1, end_time1;
+ struct timespec start_time2, end_time2;
+ long int nanosecs,total_ops;
+ float gflopsTiled,gflopsCPU;
+
+ a1 = 35;
+ a2 = 28;
+ a3 = 47;
+
+ A.height = a1;
+ A.width = a2;
+ A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));
+
+ B.height = a2;
+ B.width = a3;
+ B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));
+
+ /* Bt is same as B but stored in column-major order */
+ Bt.height = B.height;
+ Bt.width = B.width;
+ Bt.stride = B.stride;
+ Bt.hpad = B.hpad;
+ Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));
+
+ C.height = a1;
+ C.width = a3;
+ C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));
+
+ Cref.height = a1;
+ Cref.width = a3;
+ Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));
+
+ for(i = 0; i < A.hpad ; i++)
+ for(j = 0; j < A.stride; j++) {
+ if (( j<A.width ) && (i<A.height)) {
+ A.elements[i*A.stride + j] = (i % 3);
+ } else {
+ A.elements[i*A.stride + j] = 0.0;
+ }
+ }
+
+ /* Initialize B and Bt */
+ for(i = 0; i < B.hpad ; i++)
+ for(j = 0; j < B.stride; j++) {
+ if (( j<B.width ) && (i<B.height)) {
+ B.elements[i*B.stride+j] = (j % 2);
+ Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
+ } else {
+ B.elements[i*B.stride+j] = 0.0;
+ Bt.elements[j*Bt.stride+i] = 0.0;
+ }
+ }
+
+ /* zero C, and Cref */
+ for(i = 0; i < C.hpad; i++)
+ for(j = 0; j < C.stride; j++) {
+ C.elements[i*C.stride+j] = 0.0;
+ Cref.elements[i*Cref.stride+j] = 0.0;
+ }
+
+ simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
+ tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);
+
+ verify(C.elements, Cref.elements, C.height * C.stride);
+ return 0;
+}
+
+void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+const float* B,const int LDB, const float beta,float* C, const int LDC) {
+ /* A,B, and C are in row-major order */
+ int c_row,c_col,inner;
+ float sum;
+ for (c_col = 0 ; c_col<N; c_col++ ) {
+ for (c_row = 0 ; c_row<M; c_row++ ) {
+ sum = 0.0 ;
+ for (inner = 0 ; inner<K; inner++ ) {
+ sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
+ }
+ C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
+ }
+ }
+}
+
+/***************************
+
+ tiled_sgemm_tt: Tiled matrix multiplication:
+
+***************************/
+
+void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
+ const float*B, const int LDB, const float beta, float*C, const int LDC){
+
+#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
+#pragma omp distribute collapse(2)
+ for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE)
+ for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE)
+ {
+// Each team has a local copy of these mini matrices
+ float As[BLOCK_SIZE][BLOCK_SIZE];
+ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+#pragma omp parallel
+ {
+ int C_row, C_col;
+ float Cval = 0.0;
+
+ for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE )
+ {
+#pragma omp for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (kblock + col < K))
+ As[row][col] = A[(C_row*LDA)+ kblock + col];
+ else
+ As[row][col] = 0;
+ if ((kblock + row < K) && C_col < N)
+ Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
+ else
+ Bs[row][col] = 0;
+ }
+
+#pragma omp for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ for (int e = 0; e < BLOCK_SIZE; ++e)
+ Cval += As[row][e] * Bs[e][col];
+ }
+ } /* End for kblock .. */
+
+
+#pragma omp for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++)
+ for (int col=0 ; col < BLOCK_SIZE ; col++)
+ {
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (C_col < N))
+ C[(C_row*LDC)+C_col] = alpha*Cval + beta*C[(C_row*LDC)+C_col];
+
+ }
+ } /* end parallel */
+ } /* end target teams distribute */
+}
diff --git a/libgomp/testsuite/libgomp.hsa.c/tiling-2.c b/libgomp/testsuite/libgomp.hsa.c/tiling-2.c
new file mode 100644
index 0000000..6e54304
--- /dev/null
+++ b/libgomp/testsuite/libgomp.hsa.c/tiling-2.c
@@ -0,0 +1,258 @@
+/*
+
+ matmul.c : Matrix Multiplication with tiling for openmp4 example
+
+*/
+
+#include <stdlib.h>
+#include <math.h>
+
+#define BLOCK_SIZE 16
+/*
+ #define BLOCK_SIZE 32
+*/
+#define NSECPERSEC 1000000000L
+
+typedef struct {
+ int width;
+ int height;
+ int stride;
+ int hpad;
+ float* elements;
+} Matrix;
+
+/* Correctly extract the number of nanoseconds from the two time structures */
+long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
+ long int nanosecs;
+ if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
+ ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
+ ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
+ else nanosecs =
+ (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
+ ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
+ return nanosecs;
+}
+
+void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
+ const float* B,const int LDB, const float beta,float* C, const int LDC) ;
+
+int verify(float* v_res, float* v_ref, int len) {
+ int passed = 1;
+ int i;
+ for (i = 0; i < len; ++i) {
+ if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
+ __builtin_abort ();
+ }
+ }
+ return passed;
+}
+
+
+int main(int argc, char* argv[]){
+
+ Matrix A,B,Bt,C,Cref;
+ int a1,a2,a3,i,j;
+ struct timespec start_time1, end_time1;
+ struct timespec start_time2, end_time2;
+ long int nanosecs,total_ops;
+ float gflopsTiled,gflopsCPU;
+
+ a1 = 35;
+ a2 = 28;
+ a3 = 47;
+
+ A.height = a1;
+ A.width = a2;
+ A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));
+
+ B.height = a2;
+ B.width = a3;
+ B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));
+
+ /* Bt is same as B but stored in column-major order */
+ Bt.height = B.height;
+ Bt.width = B.width;
+ Bt.stride = B.stride;
+ Bt.hpad = B.hpad;
+ Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));
+
+ C.height = a1;
+ C.width = a3;
+ C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));
+
+ Cref.height = a1;
+ Cref.width = a3;
+ Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
+ Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));
+
+ for(i = 0; i < A.hpad ; i++)
+ for(j = 0; j < A.stride; j++) {
+ if (( j<A.width ) && (i<A.height)) {
+ A.elements[i*A.stride + j] = (i % 3);
+ } else {
+ A.elements[i*A.stride + j] = 0.0;
+ }
+ }
+
+ /* Initialize B and Bt */
+ for(i = 0; i < B.hpad ; i++)
+ for(j = 0; j < B.stride; j++) {
+ if (( j<B.width ) && (i<B.height)) {
+ B.elements[i*B.stride+j] = (j % 2);
+ Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
+ } else {
+ B.elements[i*B.stride+j] = 0.0;
+ Bt.elements[j*Bt.stride+i] = 0.0;
+ }
+ }
+
+ /* zero C, and Cref */
+ for(i = 0; i < C.hpad; i++)
+ for(j = 0; j < C.stride; j++) {
+ C.elements[i*C.stride+j] = 0.0;
+ Cref.elements[i*Cref.stride+j] = 0.0;
+ }
+
+ simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
+ tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);
+
+ verify(C.elements, Cref.elements, C.height * C.stride);
+ return 0;
+}
+
+void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
+const float* B,const int LDB, const float beta,float* C, const int LDC) {
+ /* A,B, and C are in row-major order */
+ int c_row,c_col,inner;
+ float sum;
+ for (c_col = 0 ; c_col<N; c_col++ ) {
+ for (c_row = 0 ; c_row<M; c_row++ ) {
+ sum = 0.0 ;
+ for (inner = 0 ; inner<K; inner++ ) {
+ sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
+ }
+ C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
+ }
+ }
+}
+
+/***************************
+
+ tiled_sgemm_tt: Tiled matrix multiplication:
+
+***************************/
+
+void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
+ const float*B, const int LDB, const float beta, float*C, const int LDC){
+
+#pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
+#pragma omp distribute collapse(2)
+ for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) {
+ for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) {
+
+// We now have M/BLOCK_SIZE * N/BLOCK_SIZE teams = (M*N)/(BLOCK_SIZE*BLOCK_SIZE)
+// The grid global dimensions are M,N,1
+// The grid local dimensions are BLOCK_SIZE,BLOCK_SIZE,1
+
+// -------------------------------------------------------------------
+// The rest of this code forms the HSAIL kernel with the
+// pairs of "paralell for collapse(2)" loops repalced with a barrier.
+// The kernel initializes these values
+// C_row_start = get_group_id(0) * BLOCK_SIZE
+// C_col_start = get_group_id(1) * BLOCK_SIZE
+// row=get_local_id(0)
+// col=get_local_id(1)
+// -------------------------------------------------------------------
+
+// Each team has a local copy of these mini matrices
+ float As[BLOCK_SIZE][BLOCK_SIZE];
+ float Bs[BLOCK_SIZE][BLOCK_SIZE];
+ float Cs[BLOCK_SIZE][BLOCK_SIZE];
+ int C_row, C_col;
+
+ /* Zero Cs for this BLOCK */
+// - - - - - - - - - - - - - - - - - - - -
+// REPLACE NEXT THREE LINES WITH A BARRIER
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++) {
+ for (int col=0 ; col < BLOCK_SIZE ; col++) {
+// END BARRIER
+// - - - - - - - - - - - - - - - - - - - -
+ Cs[row][col] = 0.0;
+ }
+ }
+
+ // This kblock loop is run on the master thread of each team
+ for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE ) {
+
+ // Copy global memory values to local memory
+// - - - - - - - - - - - - - - - - - - - -
+// REPLACE NEXT THREE LINES WITH A BARRIER
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++) {
+ for (int col=0 ; col < BLOCK_SIZE ; col++) {
+// END BARRIER
+// - - - - - - - - - - - - - - - - - - - -
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (kblock + col < K))
+ As[row][col] = A[(C_row*LDA)+ kblock + col];
+ else
+ As[row][col] = 0;
+ if ((kblock + row < K) && C_col < N)
+ Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
+ else
+ Bs[row][col] = 0;
+ }
+ }
+
+ // Calculate Cs <- Sum(As X Bs) across all kblocks
+// - - - - - - - - - - - - - - - - - - - -
+// REPLACE NEXT THREE LINES WITH A BARRIER
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++) {
+ for (int col=0 ; col < BLOCK_SIZE ; col++) {
+// END BARRIER
+// - - - - - - - - - - - - - - - - - - - -
+ for (int e = 0; e < BLOCK_SIZE; ++e)
+ Cs[row][col] += As[row][e] * Bs[e][col];
+ }
+ }
+
+ } /* End for kblock .. */
+
+
+ // Scale Update actual C from Cs
+// - - - - - - - - - - - - - - - - - - - -
+// REPLACE NEXT THREE LINES WITH A BARRIER
+#pragma omp parallel for collapse(2)
+ for (int row=0 ; row < BLOCK_SIZE ; row++) {
+ for (int col=0 ; col < BLOCK_SIZE ; col++) {
+// END BARRIER
+// - - - - - - - - - - - - - - - - - - - -
+ C_row = C_row_start + row;
+ C_col = C_col_start + col;
+ if ((C_row < M) && (C_col < N)) {
+ C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col];
+ }
+ }
+ }
+
+// -------------------------------------------------------------------
+// This is the end of the kernel
+
+ }
+ }
+
+}