diff options
author | Pekka Jääskeläinen <visit0r@gcc.gnu.org> | 2018-05-04 19:43:57 +0000 |
---|---|---|
committer | Pekka Jääskeläinen <visit0r@gcc.gnu.org> | 2018-05-04 19:43:57 +0000 |
commit | 080dc24383a602a5a4095eb05b04100f15ba1ad4 (patch) | |
tree | a0d2a78948f718293527ac76c53bdee3303d989f /libhsail-rt | |
parent | 1e25c5a9bb7042d7bca5a4fa840666dcb16f4918 (diff) | |
download | gcc-080dc24383a602a5a4095eb05b04100f15ba1ad4.zip gcc-080dc24383a602a5a4095eb05b04100f15ba1ad4.tar.gz gcc-080dc24383a602a5a4095eb05b04100f15ba1ad4.tar.bz2 |
[BRIGFE] phsa-specific optimizations
Add flag -fassume-phsa that is on by default. If -fno-assume-phsa
is given, these optimizations are disabled.
With this flag, gccbrig can generate GENERIC that assumes we are
targeting a phsa-runtime based implementation, which allows us
to expose the work-item context accesses to retrieve WI IDs etc.
which helps optimizers.
First optimization that takes advantage of this is to get rid of
the setworkitemid calls whenever we have non-inlined calls that
use IDs internally.
Other optimizations added in this commit:
- expand absoluteid to similar level of simplicity as workitemid.
At the moment absoluteid is the best indexing ID to end up with
WG vectorization.
- propagate ID variables closer to their uses. This is mainly
to avoid known useless casts, which confuse at least scalar
evolution analysis.
- use signed long long for storing IDs. Unsigned integers have
defined wraparound semantics, which confuse at least scalar
evolution analysis, leading to unvectorizable WI loops.
- also refactor some BRIG function generation helpers to brig_function.
- no point in having the wi-loop as a for-loop. It's really
a do...while and SCEV can analyze it just fine still.
- add consts to ptrs etc. in BRIG builtin defs.
Improves optimization opportunities.
- add qualifiers to generated function parameters.
Const and restrict on the hidden local/private pointers,
the arg buffer and the context pointer help some optimizations.
From-SVN: r259957
Diffstat (limited to 'libhsail-rt')
-rw-r--r-- | libhsail-rt/ChangeLog | 7 | ||||
-rw-r--r-- | libhsail-rt/include/internal/phsa-rt.h | 1 | ||||
-rw-r--r-- | libhsail-rt/include/internal/workitems.h | 50 | ||||
-rw-r--r-- | libhsail-rt/rt/workitems.c | 84 |
4 files changed, 108 insertions, 34 deletions
diff --git a/libhsail-rt/ChangeLog b/libhsail-rt/ChangeLog index 5ab9e85..17aeb6e4 100644 --- a/libhsail-rt/ChangeLog +++ b/libhsail-rt/ChangeLog @@ -1,5 +1,12 @@ 2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com> + * include/internal/phsa-rt.h: Whitespace cleanup. + * include/internal/workitems.h: Store work item ID data to easily + accessible locations. + * rt/workitems.c: Same. + +2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com> + * rt/workitems.c: Fix an alloca stack underflow. 2018-04-18 David Malcolm <dmalcolm@redhat.com> diff --git a/libhsail-rt/include/internal/phsa-rt.h b/libhsail-rt/include/internal/phsa-rt.h index d9db56c..c09f18d 100644 --- a/libhsail-rt/include/internal/phsa-rt.h +++ b/libhsail-rt/include/internal/phsa-rt.h @@ -54,7 +54,6 @@ typedef void (*gccbrigKernelFunc) (unsigned char *, void *, void *, uint32_t, */ typedef struct { - /* Data set by the HSA Runtime's kernel launcher. */ hsa_kernel_dispatch_packet_t *dp; diff --git a/libhsail-rt/include/internal/workitems.h b/libhsail-rt/include/internal/workitems.h index 73add28..0839853f 100644 --- a/libhsail-rt/include/internal/workitems.h +++ b/libhsail-rt/include/internal/workitems.h @@ -45,11 +45,6 @@ typedef struct { - /* The group id of the currently executed WG. */ - size_t x; - size_t y; - size_t z; - /* This is 1 in case there are more work groups to execute. If 0, the work-item threads should finish themselves. */ int more_wgs; @@ -89,6 +84,16 @@ typedef struct stack frame. Initialized to point outside the private segment. */ uint32_t alloca_frame_p; + /* The group id of the currently executed WG. This is for fiber based + execution. The group ids are duplicated also to the per WI context + struct for simplified single pointer access in the GCCBRIG produced + code. + */ + + uint32_t x; + uint32_t y; + uint32_t z; + } PHSAWorkGroup; /* Data identifying a single work-item, passed to the work-item thread in case @@ -96,17 +101,42 @@ typedef struct typedef struct { + /* NOTE: These members STARTing here should not be moved as they are + accessed directly by code emitted by BRIG FE. */ + + /* The local id of the current WI. */ + + uint32_t x; + uint32_t y; + uint32_t z; + + /* The group id of the currently executed WG. */ + + uint32_t group_x; + uint32_t group_y; + uint32_t group_z; + + /* The local size of a complete WG. */ + + uint32_t wg_size_x; + uint32_t wg_size_y; + uint32_t wg_size_z; + + /* The local size of the current WG. */ + + uint32_t cur_wg_size_x; + uint32_t cur_wg_size_y; + uint32_t cur_wg_size_z; + + /* NOTE: Fixed members END here. */ + PHSAKernelLaunchData *launch_data; /* Identifies and keeps book of the currently executed WG of the WI swarm. */ volatile PHSAWorkGroup *wg; - /* The local id of the current WI. */ - size_t x; - size_t y; - size_t z; #ifdef HAVE_FIBERS fiber_t fiber; #endif -} PHSAWorkItem; +} __attribute__((packed)) PHSAWorkItem; #endif diff --git a/libhsail-rt/rt/workitems.c b/libhsail-rt/rt/workitems.c index 36c9169..c846350 100644 --- a/libhsail-rt/rt/workitems.c +++ b/libhsail-rt/rt/workitems.c @@ -107,11 +107,20 @@ phsa_work_item_thread (int arg0, int arg1) the current_work_group_* is set to point to the WG executed next. */ if (!wi->wg->more_wgs) break; + + wi->group_x = wg->x; + wi->group_y = wg->y; + wi->group_z = wg->z; + + wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); + wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); + wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); + #ifdef DEBUG_PHSA_RT printf ( "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n", - wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x, - l_data->wg_max_y, l_data->wg_max_z); + wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z, + l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z); #endif if (wi->x < __hsail_currentworkgroupsize (0, wi) @@ -180,6 +189,13 @@ phsa_work_item_thread (int arg0, int arg1) else wg->x++; #endif + wi->group_x = wg->x; + wi->group_y = wg->y; + wi->group_z = wg->z; + + wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); + wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); + wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); /* Reinitialize the work-group barrier according to the new WG's size, which might not be the same as the previous ones, due @@ -233,6 +249,7 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, PHSAWorkItem *wi_threads = NULL; PHSAWorkGroup wg; size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z; + uint32_t group_x, group_y, group_z; fiber_barrier_t wg_start_barrier; fiber_barrier_t wg_completion_barrier; fiber_barrier_t wg_sync_barrier; @@ -257,13 +274,13 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, wg.initial_group_offset = group_local_offset; #ifdef EXECUTE_WGS_BACKWARDS - wg.x = context->wg_max_x - 1; - wg.y = context->wg_max_y - 1; - wg.z = context->wg_max_z - 1; + group_x = context->wg_max_x - 1; + group_y = context->wg_max_y - 1; + group_z = context->wg_max_z - 1; #else - wg.x = context->wg_min_x; - wg.y = context->wg_min_y; - wg.z = context->wg_min_z; + group_x = context->wg_min_x; + group_y = context->wg_min_y; + group_z = context->wg_min_z; #endif fiber_barrier_init (&wg_sync_barrier, wg_size); @@ -290,6 +307,19 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, PHSAWorkItem *wi = &wi_threads[flat_wi_id]; wi->launch_data = context; wi->wg = &wg; + + wg.x = wi->group_x = group_x; + wg.y = wi->group_y = group_y; + wg.z = wi->group_z = group_z; + + wi->wg_size_x = context->dp->workgroup_size_x; + wi->wg_size_y = context->dp->workgroup_size_y; + wi->wg_size_z = context->dp->workgroup_size_z; + + wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); + wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); + wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); + wi->x = x; wi->y = y; wi->z = z; @@ -467,9 +497,17 @@ phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr, for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y) for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x) { - wi.wg->x = wg_x; - wi.wg->y = wg_y; - wi.wg->z = wg_z; + wi.group_x = wg_x; + wi.group_y = wg_y; + wi.group_z = wg_z; + + wi.wg_size_x = context->dp->workgroup_size_x; + wi.wg_size_y = context->dp->workgroup_size_y; + wi.wg_size_z = context->dp->workgroup_size_z; + + wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi); + wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi); + wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi); context->kernel (context->kernarg_addr, &wi, group_base_ptr, group_local_offset, private_base_ptr); @@ -564,15 +602,15 @@ __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context) default: case 0: /* Overflow semantics in the case of WG dim > grid dim. */ - id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x) + id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x) % dp->grid_size_x; break; case 1: - id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y) + id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y) % dp->grid_size_y; break; case 2: - id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z) + id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z) % dp->grid_size_z; break; } @@ -590,15 +628,15 @@ __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context) default: case 0: /* Overflow semantics in the case of WG dim > grid dim. */ - id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x) + id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x) % dp->grid_size_x; break; case 1: - id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y) + id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y) % dp->grid_size_y; break; case 2: - id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z) + id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z) % dp->grid_size_z; break; } @@ -738,19 +776,19 @@ __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi) { default: case 0: - if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x) + if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x) wg_size = dp->workgroup_size_x; /* Full WG. */ else wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */ break; case 1: - if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y) + if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y) wg_size = dp->workgroup_size_y; /* Full WG. */ else wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */ break; case 2: - if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z) + if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z) wg_size = dp->workgroup_size_z; /* Full WG. */ else wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */ @@ -798,11 +836,11 @@ __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi) { default: case 0: - return wi->wg->x; + return wi->group_x; case 1: - return wi->wg->y; + return wi->group_y; case 2: - return wi->wg->z; + return wi->group_z; } } |