Diffstat (limited to 'gprofng/common/hwcdrv.c')
-rw-r--r--  gprofng/common/hwcdrv.c  1454
1 file changed, 1454 insertions, 0 deletions
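
The patch below drives the Linux perf_event subsystem directly. The sketches that follow are editorial illustrations only, not part of the patch; they show the kernel interfaces the driver relies on, using assumed event choices and periods. The first mirrors the availability probe in hdrv_pcl_internal_open(): open one abstracted hardware event (PERF_COUNT_HW_INSTRUCTIONS) for the calling thread through the raw perf_event_open syscall, which has no glibc wrapper, enable it, and read back the 64-bit count. Error handling is reduced to perror() for brevity.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long
pe_open (struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd,
         unsigned long flags)
{
  /* raw syscall, as the driver below does; glibc provides no wrapper */
  return syscall (__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int
main (void)
{
  struct perf_event_attr attr;
  memset (&attr, 0, sizeof (attr));
  attr.size = sizeof (attr);
  attr.type = PERF_TYPE_HARDWARE;           /* abstracted HW event */
  attr.config = PERF_COUNT_HW_INSTRUCTIONS; /* retired instructions */
  attr.disabled = 1;                        /* enable explicitly below */
  attr.exclude_kernel = 1;
  attr.exclude_hv = 1;

  int fd = pe_open (&attr, 0 /* self */, -1 /* any cpu */, -1, 0);
  if (fd == -1)
    {
      perror ("perf_event_open");
      return 1;
    }
  ioctl (fd, PERF_EVENT_IOC_RESET, 0);
  ioctl (fd, PERF_EVENT_IOC_ENABLE, 0);

  /* ... code to be measured would run here ... */

  ioctl (fd, PERF_EVENT_IOC_DISABLE, 0);
  long long count = 0;
  if (read (fd, &count, sizeof (count)) == sizeof (count)) /* read_format == 0: plain u64 */
    printf ("instructions: %lld\n", count);
  close (fd);
  return 0;
}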
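A second sketch compresses the per-thread overflow plumbing that start_one_ctr() and hwcdrv_overflow() implement below: the counter is opened with a sample_period, the fd is switched to O_ASYNC with SIGIO routed to the owning thread via F_SETOWN_EX/F_OWNER_TID, and PERF_EVENT_IOC_REFRESH(1) arms it in one-shot mode so each overflow arrives as SIGIO with si_code == POLL_HUP and si_fd identifying the counter. The event choice (CPU cycles) and period here are arbitrary; re-arming with ioctl() from the handler matches what the driver does, although ioctl() is not formally async-signal-safe.

#define _GNU_SOURCE            /* F_SETSIG, F_SETOWN_EX, F_OWNER_TID */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int hwc_fd = -1;
static volatile sig_atomic_t overflows;

static void
sigio_handler (int sig, siginfo_t *si, void *ctx)
{
  (void) sig; (void) ctx;
  if (si->si_code == POLL_HUP && si->si_fd == hwc_fd)
    {
      overflows++;
      ioctl (hwc_fd, PERF_EVENT_IOC_REFRESH, 1);   /* re-arm the one-shot counter */
    }
}

int
main (void)
{
  struct sigaction sa;
  memset (&sa, 0, sizeof (sa));
  sa.sa_sigaction = sigio_handler;
  sa.sa_flags = SA_SIGINFO | SA_RESTART;
  sigaction (SIGIO, &sa, NULL);

  struct perf_event_attr attr;
  memset (&attr, 0, sizeof (attr));
  attr.size = sizeof (attr);
  attr.type = PERF_TYPE_HARDWARE;
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  attr.sample_period = 10 * 1000 * 1000;   /* arbitrary overflow interval */
  attr.wakeup_events = 1;
  attr.disabled = 1;
  attr.exclude_kernel = 1;
  attr.exclude_hv = 1;

  hwc_fd = syscall (__NR_perf_event_open, &attr, 0, -1, -1, 0);
  if (hwc_fd == -1)
    {
      perror ("perf_event_open");
      return 1;
    }

  /* route SIGIO for this fd to the calling thread, with si_fd filled in */
  fcntl (hwc_fd, F_SETFL, fcntl (hwc_fd, F_GETFL, 0) | O_ASYNC);
  fcntl (hwc_fd, F_SETSIG, SIGIO);
  struct f_owner_ex owner = { .type = F_OWNER_TID, .pid = syscall (SYS_gettid) };
  fcntl (hwc_fd, F_SETOWN_EX, &owner);

  ioctl (hwc_fd, PERF_EVENT_IOC_REFRESH, 1);   /* arm: disable after next overflow */

  for (volatile long i = 0; i < 200 * 1000 * 1000; i++)   /* burn cycles */
    ;

  printf ("overflow signals seen: %d\n", (int) overflows);
  close (hwc_fd);
  return 0;
}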
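A third sketch shows the ring-buffer layout that reset_buf(), skip_buf() and read_buf() below operate on: an mmap of one metadata page followed by a power-of-two number of data pages, with the kernel advancing data_head and the consumer advancing data_tail after copying records out, wrapping at the end of the data area. The helper is a standalone analogue of read_buf(); a production reader would also place a read memory barrier between loading data_head and touching the data.

#include <stdint.h>
#include <string.h>
#include <linux/perf_event.h>

#define DATA_PAGES 1   /* must be a power of two, like NPAGES_PER_BUF below */

/* Copy sz bytes of record data out of a perf_event mmap ring buffer.
   'map' is the region returned by mmap(NULL, (DATA_PAGES + 1) * pagesz, ...).
   Returns 0 on success, -1 if the request cannot be satisfied. */
static int
ring_read (struct perf_event_mmap_page *map, size_t pagesz, void *out, size_t sz)
{
  uint64_t head = map->data_head;   /* written by the kernel */
  uint64_t tail = map->data_tail;   /* owned by the consumer */
  size_t bufsz = DATA_PAGES * pagesz;

  if (sz > head - tail || sz >= bufsz)
    return -1;                      /* not enough data, or request too large */

  char *data = (char *) map + pagesz;      /* data area follows the metadata page */
  size_t pos = tail & (bufsz - 1);         /* offset of the oldest unread byte */
  size_t first = sz;
  if (pos + sz > bufsz)                    /* record wraps past the end */
    first = bufsz - pos;
  memcpy (out, data + pos, first);
  memcpy ((char *) out + first, data, sz - first);

  map->data_tail = tail + sz;              /* tell the kernel the space is free */
  return 0;
}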
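Finally, a small worked example of the event encoding that myperfctr_get_x86_eventnum() and set_x86_attr_bits() produce for the PERFEVTSEL-style layout assumed in the patch: the event code sits in the low bits, the umask in bits [15:8], and each attribute from perfctr_x64_attrs is folded in at its (mask, shift) position on top of the default enable bits (usr, INT, EN). The event code 0x3c with umask 0 (Intel's architectural unhalted-core-cycles event) is used purely as an illustration; the result is what the driver hands to perf_event_attr.config as a PERF_TYPE_RAW event.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t eventsel_t;

/* Set 'invalue' into *presult at (mask, shift), mirroring mask_shift_set()
   in the patch below.  Returns -1 if invalue does not fit in mask. */
static int
mask_shift_set (eventsel_t *presult, eventsel_t invalue, eventsel_t mask,
                eventsel_t shift)
{
  if (invalue & ~mask)
    return -1;
  *presult &= ~(mask << shift);
  *presult |= (invalue << shift);
  return 0;
}

int
main (void)
{
  /* default enable bits, as in perfctr_evntsel_enable_bits: usr | INT | EN */
  eventsel_t config = (1u << 16) | (1u << 20) | (1u << 22);

  config |= 0x3c;                            /* event code: unhalted core cycles */
  mask_shift_set (&config, 0x00, 0xff, 8);   /* umask  (bits 15:8)  */
  mask_shift_set (&config, 0x01, 0x01, 17);  /* system (bit 17, "os") */

  printf ("config = 0x%llx\n", (unsigned long long) config);   /* 0x53003c */
  return 0;
}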
diff --git a/gprofng/common/hwcdrv.c b/gprofng/common/hwcdrv.c
new file mode 100644
index 0000000..caab983
--- /dev/null
+++ b/gprofng/common/hwcdrv.c
@@ -0,0 +1,1454 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   Contributed by Oracle.
+
+   This file is part of GNU Binutils.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, 51 Franklin Street - Fifth Floor, Boston,
+   MA 02110-1301, USA.  */
+
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+
+#include "hwcdrv.h"
+
+/*---------------------------------------------------------------------------*/
+/* macros */
+#define IS_GLOBAL /* Mark global symbols */
+
+#include "cpuid.c" /* ftns for identifying a chip */
+
+static hdrv_pcbe_api_t hdrv_pcbe_core_api;
+static hdrv_pcbe_api_t hdrv_pcbe_opteron_api;
+static hdrv_pcbe_api_t *hdrv_pcbe_drivers[] = {
+  &hdrv_pcbe_core_api,
+  &hdrv_pcbe_opteron_api,
+  NULL
+};
+#include "opteron_pcbe.c" /* CPU-specific code */
+#include "core_pcbe.c" /* CPU-specific code */
+
+extern hwcdrv_api_t hwcdrv_pcl_api;
+IS_GLOBAL hwcdrv_api_t *hwcdrv_drivers[] = {
+  &hwcdrv_pcl_api,
+  NULL
+};
+
+/*---------------------------------------------------------------------------*/
+
+/* utils for drivers */
+IS_GLOBAL int
+hwcdrv_assign_all_regnos (Hwcentry* entries[], unsigned numctrs)
+{
+  unsigned int pmc_assigned[MAX_PICS];
+  unsigned idx;
+  for (int ii = 0; ii < MAX_PICS; ii++)
+    pmc_assigned[ii] = 0;
+
+  /* assign the HWCs that we already know about */
+  for (idx = 0; idx < numctrs; idx++)
+    {
+      regno_t regno = entries[idx]->reg_num;
+      if (regno == REGNO_ANY)
+        {
+          /* check to see if list of possible registers only contains one entry */
+          regno = REG_LIST_SINGLE_VALID_ENTRY (entries[idx]->reg_list);
+        }
+      if (regno != REGNO_ANY)
+        {
+          if (regno < 0 || regno >= MAX_PICS || !regno_is_valid (entries[idx], regno))
+            {
+              logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
+              return HWCFUNCS_ERROR_HWCARGS;
+            }
+          TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): preselected: idx=%d, regno=%d\n", idx, regno);
+          entries[idx]->reg_num = regno; /* assigning back to entries */
+          pmc_assigned[regno] = 1;
+        }
+    }
+
+  /* assign HWCs that are currently REGNO_ANY */
+  for (idx = 0; idx < numctrs; idx++)
+    {
+      if (entries[idx]->reg_num == REGNO_ANY)
+        {
+          int assigned = 0;
+          regno_t *reg_list = entries[idx]->reg_list;
+          for (; reg_list && *reg_list != REGNO_ANY; reg_list++)
+            {
+              regno_t regno = *reg_list;
+              if (regno < 0 || regno >= MAX_PICS)
+                {
+                  logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
+                  return HWCFUNCS_ERROR_HWCARGS;
+                }
+              if (pmc_assigned[regno] == 0)
+                {
+                  TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): assigned: idx=%d, regno=%d\n", idx, regno);
+                  entries[idx]->reg_num = regno; /* assigning back to entries */
+                  pmc_assigned[regno] = 1;
+                  assigned = 1;
+                  break;
+
} + if (!assigned) + { + logerr (GTXT ("Counter '%s' could not be bound to a register\n"), + entries[idx]->name ? entries[idx]->name : "<NULL>"); + return HWCFUNCS_ERROR_HWCARGS; + } + } + } + return 0; +} + +IS_GLOBAL int +hwcdrv_lookup_cpuver (const char * cpcN_cciname) +{ + libcpc2_cpu_lookup_t *plookup; + static libcpc2_cpu_lookup_t cpu_table[] = { + LIBCPC2_CPU_LOOKUP_LIST + }; + if (cpcN_cciname == NULL) + return CPUVER_UNDEFINED; + + /* search table for name */ + for (plookup = cpu_table; plookup->cpc2_cciname; plookup++) + { + int n = strlen (plookup->cpc2_cciname); + if (!strncmp (plookup->cpc2_cciname, cpcN_cciname, n)) + return plookup->cpc2_cpuver; + } + /* unknown, but does have a descriptive string */ + TprintfT (DBG_LT0, "hwcfuncs: CPC2: WARNING: Id of processor '%s' " + "could not be determined\n", + cpcN_cciname); + return CPUVER_GENERIC; +} + +/*---------------------------------------------------------------------------*/ +/* utils to generate x86 register definitions on Linux */ + +/* + * This code is structured as though we're going to initialize the + * HWC by writing the Intel MSR register directly. That is, we + * assume the lowest 16 bits of the event number will have the event + * and that higher bits will set attributes. + * + * While SPARC is different, we can nonetheless use basically the + * same "x86"-named functions: + * + * - The event code will still be 16 bits. It will still + * be in the lowest 16 bits of the event number. Though + * perf_event_code() on SPARC will expect those bits to + * shifted, hwcdrv_pcl.c can easily perform that shift. + * + * - On SPARC we support only two attributes, "user" and "system", + * which hwcdrv_pcl.c already converts to the "exclude_user" + * and "exclude_kernel" fields expected by perf_event_open(). + * "user" and "system" are stored in event bits 16 and 17. + * For M8, a 4-bit mask of supported PICs is stored in bits [23:20]. + */ + +IS_GLOBAL hwcdrv_get_eventnum_fn_t *hwcdrv_get_x86_eventnum = 0; + +static const attr_info_t perfctr_sparc_attrs[] = { + {NTXT ("user"), 0, 0x01, 16}, //usr + {NTXT ("system"), 0, 0x01, 17}, //os + {NULL, 0, 0x00, 0}, +}; +static const attr_info_t perfctr_x64_attrs[] = {/* ok for Core2 & later */ + {NTXT ("umask"), 0, 0xff, 8}, + {NTXT ("user"), 0, 0x01, 16}, //usr + //{NTXT("nouser"), 1, 0x01, 16}, //usr (inverted) + {NTXT ("system"), 0, 0x01, 17}, //os + {NTXT ("edge"), 0, 0x01, 18}, + {NTXT ("pc"), 0, 0x01, 19}, + {NTXT ("inv"), 0, 0x01, 23}, + {NTXT ("cmask"), 0, 0xff, 24}, + {NULL, 0, 0x00, 0}, +}; +const attr_info_t *perfctr_attrs_table = perfctr_x64_attrs; + +static const eventsel_t perfctr_evntsel_enable_bits = (0x01 << 16) | /* usr */ + // (0xff << 0) | /* event*/ + // (0xff << 8) | /* umask */ + // (0x01 << 17) | /* os */ + // (0x01 << 18) | /* edge */ + // (0x01 << 19) | /* pc */ + (0x01 << 20) | /* int */ + // (0x01 << 21) | /* reserved */ + (0x01 << 22) | /* enable */ + // (0x01 << 23) | /* inv */ + // (0xff << 24) | /* cmask */ + 0; + +static int +myperfctr_get_x86_eventnum (const char *eventname, uint_t pmc, + eventsel_t *eventsel, eventsel_t *valid_umask, + uint_t *pmc_sel) +{ + if (hwcdrv_get_x86_eventnum && + !hwcdrv_get_x86_eventnum (eventname, pmc, eventsel, valid_umask, pmc_sel)) + return 0; + + /* check for numerically-specified counters */ + char * endptr; + uint64_t num = strtoull (eventname, &endptr, 0); + if (*eventname && !*endptr) + { + *eventsel = EXTENDED_EVNUM_2_EVSEL (num); + *valid_umask = 0xff; /* allow any umask (unused for SPARC?) 
*/ + *pmc_sel = pmc; + return 0; + } + + /* name does not specify a numeric value */ + *eventsel = (eventsel_t) - 1; + *valid_umask = 0x0; + *pmc_sel = pmc; + return -1; +} + +static int +mask_shift_set (eventsel_t *presult, eventsel_t invalue, + eventsel_t mask, eventsel_t shift) +{ + if (invalue & ~mask) + return -1; /* invalue attempts to set bits outside of mask */ + *presult &= ~(mask << shift); /* clear all the mask bits */ + *presult |= (invalue << shift); /* set bits according to invalue */ + return 0; +} + +static int +set_x86_attr_bits (eventsel_t *result_mask, eventsel_t evnt_valid_umask, + hwcfuncs_attr_t attrs[], int nattrs, const char*nameOnly) +{ + eventsel_t evntsel = *result_mask; + for (int ii = 0; ii < (int) nattrs; ii++) + { + const char *attrname = attrs[ii].ca_name; + eventsel_t attrval = (eventsel_t) attrs[ii].ca_val; + const char *tmpname; + int attr_found = 0; + for (int jj = 0; (tmpname = perfctr_attrs_table[jj].attrname); jj++) + { + if (strcmp (attrname, tmpname) == 0) + { + if (strcmp (attrname, "umask") == 0) + { + if (attrval & ~evnt_valid_umask) + { + logerr (GTXT ("for `%s', allowable umask bits are: 0x%llx\n"), + nameOnly, (long long) evnt_valid_umask); + return -1; + } + } + if (mask_shift_set (&evntsel, + perfctr_attrs_table[jj].is_inverted ? (attrval^1) : attrval, + perfctr_attrs_table[jj].mask, + perfctr_attrs_table[jj].shift)) + { + logerr (GTXT ("`%s' attribute `%s' could not be set to 0x%llx\n"), + nameOnly, attrname, (long long) attrval); + return -1; + } + TprintfT (DBG_LT2, "hwcfuncs: Counter %s, attribute %s set to 0x%llx\n", + nameOnly, attrname, (long long) attrval); + attr_found = 1; + break; + } + } + if (!attr_found) + { + logerr (GTXT ("attribute `%s' is invalid\n"), attrname); + return -1; + } + } + *result_mask = evntsel; + return 0; +} + +IS_GLOBAL int +hwcfuncs_get_x86_eventsel (unsigned int regno, const char *int_name, + eventsel_t *return_event, uint_t *return_pmc_sel) +{ + hwcfuncs_attr_t attrs[HWCFUNCS_MAX_ATTRS + 1]; + unsigned nattrs = 0; + char *nameOnly = NULL; + eventsel_t evntsel = 0; // event number + eventsel_t evnt_valid_umask = 0; + uint_t pmc_sel = 0; + int rc = -1; + *return_event = 0; + *return_pmc_sel = 0; + void *attr_mem = hwcfuncs_parse_attrs (int_name, attrs, HWCFUNCS_MAX_ATTRS, + &nattrs, NULL); + if (!attr_mem) + { + logerr (GTXT ("out of memory, could not parse attributes\n")); + return -1; + } + hwcfuncs_parse_ctr (int_name, NULL, &nameOnly, NULL, NULL, NULL); + if (regno == REGNO_ANY) + { + logerr (GTXT ("reg# could not be determined for `%s'\n"), nameOnly); + goto attr_wrapup; + } + + /* look up evntsel */ + if (myperfctr_get_x86_eventnum (nameOnly, regno, + &evntsel, &evnt_valid_umask, &pmc_sel)) + { + logerr (GTXT ("counter `%s' is not valid\n"), nameOnly); + goto attr_wrapup; + } + TprintfT (DBG_LT1, "hwcfuncs: event=0x%llx pmc=0x%x '%s' nattrs = %u\n", + (long long) evntsel, pmc_sel, nameOnly, nattrs); + + /* determine event attributes */ + eventsel_t evnt_attrs = perfctr_evntsel_enable_bits; + if (set_x86_attr_bits (&evnt_attrs, evnt_valid_umask, attrs, nattrs, nameOnly)) + goto attr_wrapup; + if (evntsel & evnt_attrs) + TprintfT (DBG_LT0, "hwcfuncs: ERROR - evntsel & enable bits overlap: 0x%llx 0x%llx 0x%llx\n", + (long long) evntsel, (long long) evnt_attrs, + (long long) (evntsel & evnt_attrs)); + *return_event = evntsel | evnt_attrs; + *return_pmc_sel = pmc_sel; + rc = 0; + +attr_wrapup: + free (attr_mem); + free (nameOnly); + return rc; +} + +#ifdef __x86_64__ +#define syscall_instr "syscall" 
+#define syscall_clobber "rcx", "r11", "memory" +#endif +#ifdef __i386__ +#define syscall_instr "int $0x80" +#define syscall_clobber "memory" +#endif + +static inline int +perf_event_open (struct perf_event_attr *hw_event_uptr, pid_t pid, + int cpu, int group_fd, unsigned long flags) +{ + /* It seems that perf_event_open() sometimes fails spuriously, + * even while an immediate retry succeeds. + * So, let's try a few retries if the call fails just to be sure. + */ + int rc; + for (int retry = 0; retry < 5; retry++) + { + rc = syscall (__NR_perf_event_open, hw_event_uptr, pid, cpu, group_fd, flags); + if (rc != -1) + return rc; + } + return rc; +} + +/*---------------------------------------------------------------------------*/ +/* macros & fwd prototypes */ + +#define HWCDRV_API static /* Mark functions used by hwcdrv API */ + +HWCDRV_API int hwcdrv_start (void); +HWCDRV_API int hwcdrv_free_counters (); + +static pid_t +hwcdrv_gettid (void) +{ +#ifndef LIBCOLLECTOR_SRC + return syscall (__NR_gettid); +#elif defined(intel) + pid_t r; + __asm__ __volatile__(syscall_instr + : "=a" (r) : "0" (__NR_gettid) + : syscall_clobber); + return r; +#else + return syscall (__NR_gettid); // FIXUP_XXX_SPARC_LINUX // write gettid in asm +#endif +} + +/*---------------------------------------------------------------------------*/ +/* types */ + +#define NPAGES_PER_BUF 1 // number of pages to be used for perf_event samples +// must be a power of 2 + +/*---------------------------------------------------------------------------*/ + +/* typedefs */ + +typedef struct +{ // event (hwc) definition + unsigned int reg_num; // PMC assignment, potentially for detecting conflicts + eventsel_t eventsel; // raw event bits (Intel/AMD) + uint64_t counter_preload; // number of HWC events before signal + struct perf_event_attr hw; // perf_event definition + hrtime_t min_time; // minimum time we're targeting between events + char *name; +} perf_event_def_t; + +typedef struct +{ // runtime state of perf_event buffer + void *buf; // pointer to mmapped buffer + size_t pagesz; // size of pages +} buffer_state_t; + +typedef struct +{ // runtime state of counter values + uint64_t prev_ena_ts; // previous perf_event "enabled" time + uint64_t prev_run_ts; // previous perf_event "running" time + uint64_t prev_value; // previous HWC value +} counter_value_state_t; + +typedef struct +{ // per-counter information + perf_event_def_t *ev_def; // global HWC definition for one counter + int fd; // perf_event fd + buffer_state_t buf_state; // perf_event buffer's state + counter_value_state_t value_state; // counter state + int needs_restart; // workaround for dbx failure to preserve si_fd + uint64_t last_overflow_period; + hrtime_t last_overflow_time; +} counter_state_t; + +typedef struct +{ // per-thread context + counter_state_t *ctr_list; + int signal_fd; // fd that caused the most recent signal + pthread_t tid; // for debugging signal delivery problems +} hdrv_pcl_ctx_t; + +/*---------------------------------------------------------------------------*/ + +/* static variables */ +static struct +{ + int library_ok; + int internal_open_called; + hwcfuncs_tsd_get_fn_t find_vpc_ctx; + unsigned hwcdef_cnt; /* number of *active* hardware counters */ + hwcdrv_get_events_fn_t *get_events; +} hdrv_pcl_state; + +static hwcdrv_about_t hdrv_pcl_about = {.cpcN_cpuver = CPUVER_UNDEFINED}; +static perf_event_def_t global_perf_event_def[MAX_PICS]; + +#define COUNTERS_ENABLED() (hdrv_pcl_state.hwcdef_cnt) + + +/* perf_event buffer formatting and handling 
*/ +static void +reset_buf (buffer_state_t *bufstate) +{ + TprintfT (0, "hwcdrv: ERROR: perf_event reset_buf() called!\n"); + struct perf_event_mmap_page *metadata = bufstate->buf; + if (metadata) + metadata->data_tail = metadata->data_head; +} + +static int +skip_buf (buffer_state_t *bufstate, size_t sz) +{ + TprintfT (DBG_LT1, "hwcdrv: WARNING: perf_event skip_buf called!\n"); + struct perf_event_mmap_page *metadata = bufstate->buf; + if (metadata == NULL) + return -1; + size_t pgsz = bufstate->pagesz; + size_t bufsz = NPAGES_PER_BUF*pgsz; + uint64_t d_tail = metadata->data_tail; + uint64_t d_head = metadata->data_head; + + // validate request size + if (sz > d_head - d_tail || sz >= bufsz) + { + reset_buf (bufstate); + return -1; + } + metadata->data_tail = d_tail + sz; // advance tail + return 0; +} + +static int +read_buf (buffer_state_t *bufstate, void *buf, size_t sz) +{ + struct perf_event_mmap_page *metadata = bufstate->buf; + if (metadata == NULL) + return -1; + size_t pgsz = bufstate->pagesz; + size_t bufsz = NPAGES_PER_BUF*pgsz; + uint64_t d_tail = metadata->data_tail; + uint64_t d_head = metadata->data_head; + + // validate request size + if (sz > d_head - d_tail || sz >= bufsz) + { + reset_buf (bufstate); + return -1; + } + char *buf_base = ((char *) metadata) + pgsz; // start of data buffer + uint64_t start_pos = d_tail & (bufsz - 1); // char offset into data buffer + size_t nbytes = sz; + if (start_pos + sz > bufsz) + { + // will wrap past end of buffer + nbytes = bufsz - start_pos; + memcpy (buf, buf_base + start_pos, nbytes); + start_pos = 0; // wrap to start + buf = (void *) (((char *) buf) + nbytes); + nbytes = sz - nbytes; + } + memcpy (buf, buf_base + start_pos, nbytes); + metadata->data_tail += sz; + return 0; +} + +static int +read_u64 (buffer_state_t *bufstate, uint64_t *value) +{ + return read_buf (bufstate, value, sizeof (uint64_t)); +} + +static int +read_sample (counter_state_t *ctr_state, int msgsz, uint64_t *rvalue, + uint64_t *rlost) +{ + // returns count of bytes read + buffer_state_t *bufstate = &ctr_state->buf_state; + counter_value_state_t *cntstate = &ctr_state->value_state; + int readsz = 0; + + // PERF_SAMPLE_IP + uint64_t ipc = 0; + int rc = read_u64 (bufstate, &ipc); + if (rc) + return -1; + readsz += sizeof (uint64_t); + + // PERF_SAMPLE_READ: value + uint64_t value = 0; + rc = read_u64 (bufstate, &value); + if (rc) + return -2; + readsz += sizeof (uint64_t); + + /* Bug 20806896 + * Old Linux kernels (e.g. 2.6.32) on certain systems return enabled and + * running times in the sample data that correspond to the metadata times + * metadata->time_enabled + * metadata->time_running + * from the PREVIOUS (not current) sample. Probably just ignore this bug + * since it's on old kernels and we only use the enabled and running times + * to construct loss_estimate. 
+ */ + // PERF_SAMPLE_READ: PERF_FORMAT_ENABLED + uint64_t enabled_time = 0; + rc = read_u64 (bufstate, &enabled_time); + if (rc) + return -3; + readsz += sizeof (uint64_t); + + // PERF_SAMPLE_READ: PERF_FORMAT_RUNNING + uint64_t running_time = 0; + rc = read_u64 (bufstate, &running_time); + if (rc) + return -4; + readsz += sizeof (uint64_t); + + uint64_t value_delta = value - cntstate->prev_value; + uint64_t enabled_delta = enabled_time - cntstate->prev_ena_ts; + uint64_t running_delta = running_time - cntstate->prev_run_ts; + cntstate->prev_value = value; + cntstate->prev_ena_ts = enabled_time; + cntstate->prev_run_ts = running_time; + + // 24830461 need workaround for Linux anomalous HWC skid overrun + int set_error_flag = 0; + if (value_delta > 2 * ctr_state->last_overflow_period + 2000 /* HWC_SKID_TOLERANCE */) + set_error_flag = 1; + + uint64_t loss_estimate = 0; // estimate loss of events caused by multiplexing + if (running_delta == enabled_delta) + { + // counter was running 100% of time, no multiplexing + } + else if (running_delta == 0) + loss_estimate = 1; // token amount to aid in debugging perfctr oddities + else if ((running_delta > enabled_delta) || (enabled_delta & 0x1000000000000000ll)) + { + // running should be smaller than enabled, can't estimate + /* + * 21418391 HWC can have a negative count + * + * We've also seen enabled not only be smaller than running + * but in fact go negative. Guard against this. + */ + loss_estimate = 2; // token amount to aid in debugging perfctr oddities + } + else + { + // counter was running less than 100% of time + // Example: ena=7772268 run=6775669 raw_value=316004 scaled_value=362483 loss_est=46479 + uint64_t scaled_delta = (double) value_delta * enabled_delta / running_delta; + value_delta = scaled_delta; +#if 0 + // We should perhaps warn the user that multiplexing is going on, + // but hwcdrv_pcl.c doesn't know about the collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_* values. + // For now we simply don't report. + // Perhaps we should address the issue not here but in the caller collector_sigemt_handler(), + // but at that level "lost" has a meaning that's considerably broader than just multiplexing. + collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n", + SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name, + ctr_list[idx].last_overflow_period, new_period); +#endif + } + TprintfT ((loss_estimate || set_error_flag) ? DBG_LT1 : DBG_LT3, + "hwcdrv: '%s' ipc=0x%llx ena=%llu run=%llu " + "value_delta=%lld(0x%llx) loss_est=%llu %s error_flag='%s'\n", + ctr_state->ev_def->name, (long long) ipc, + (long long) enabled_delta, (long long) running_delta, + (long long) value_delta, (long long) value_delta, + (unsigned long long) loss_estimate, + loss_estimate ? ", WARNING - SCALED" : "", + set_error_flag ? 
", ERRORFLAG" : ""); + if (set_error_flag == 1) + value_delta |= (1ULL << 63) /* HWCVAL_ERR_FLAG */; + *rvalue = value_delta; + *rlost = loss_estimate; + if (readsz != msgsz) + { + TprintfT (0, "hwcdrv: ERROR: perf_event sample not fully parsed\n"); + return -5; + } + return 0; +} + +static void +dump_perf_event_attr (struct perf_event_attr *at) +{ + TprintfT (DBG_LT2, "dump_perf_event_attr: size=%d type=%d sample_period=%lld\n" + " config=0x%llx config1=0x%llx config2=0x%llx wakeup_events=%lld __reserved_1=%lld\n", + (int) at->size, (int) at->type, (unsigned long long) at->sample_period, + (unsigned long long) at->config, (unsigned long long) at->config1, + (unsigned long long) at->config2, (unsigned long long) at->wakeup_events, + (unsigned long long) at->__reserved_1); +#define DUMP_F(fld) if (at->fld) TprintfT(DBG_LT2, " %-10s : %lld\n", #fld, (long long) at->fld) + DUMP_F (disabled); + DUMP_F (inherit); + DUMP_F (pinned); + DUMP_F (exclusive); + DUMP_F (exclude_user); + DUMP_F (exclude_kernel); + DUMP_F (exclude_hv); + DUMP_F (exclude_idle); + // DUMP_F(xmmap); + DUMP_F (comm); + DUMP_F (freq); + DUMP_F (inherit_stat); + DUMP_F (enable_on_exec); + DUMP_F (task); + DUMP_F (watermark); +} + +static void +init_perf_event (struct perf_event_attr *hw, uint64_t event, uint64_t period) +{ + memset (hw, 0, sizeof (struct perf_event_attr)); + hw->size = sizeof (struct perf_event_attr); // fwd/bwd compat + +#if defined(__i386__) || defined(__x86_64) + //note: Nehalem/Westmere OFFCORE_RESPONSE in upper 32 bits + hw->config = event; + hw->type = PERF_TYPE_RAW; // hw/sw/trace/raw... +#elif defined(__aarch64__) + hw->type = (event >> 24) & 7; + hw->config = event & 0xff; +#elif defined(sparc) + //SPARC needs to be shifted up 16 bits + hw->config = (event & 0xFFFF) << 16; // uint64_t event + uint64_t regs = (event >> 20) & 0xf; // see sparc_pcbe.c + hw->config |= regs << 4; // for M8, supported PICs need to be placed at bits [7:4] + hw->type = PERF_TYPE_RAW; // hw/sw/trace/raw... +#endif + + hw->sample_period = period; + hw->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_READ | + // PERF_SAMPLE_TID | + // PERF_SAMPLE_TIME | // possibly interesting + // PERF_SAMPLE_ADDR | + PERF_SAMPLE_READ | // HWC value + // PERF_SAMPLE_CALLCHAIN | // interesting + // PERF_SAMPLE_ID | + // PERF_SAMPLE_CPU | // possibly interesting + // PERF_SAMPLE_PERIOD | + // PERF_SAMPLE_STREAM_ID | + // PERF_SAMPLE_RAW | + 0; + hw->read_format = + PERF_FORMAT_TOTAL_TIME_ENABLED | // detect when hwc not scheduled + PERF_FORMAT_TOTAL_TIME_RUNNING | // detect when hwc not scheduled + // PERF_FORMAT_ID | + // PERF_FORMAT_GROUP | + 0; + hw->disabled = 1; /* off by default */ + + // Note: the following override config.priv bits! 
+ hw->exclude_user = (event & (1 << 16)) == 0; /* don't count user */ + hw->exclude_kernel = (event & (1 << 17)) == 0; /* ditto kernel */ + hw->exclude_hv = 1; /* ditto hypervisor */ + hw->wakeup_events = 1; /* wakeup every n events */ + dump_perf_event_attr (hw); +} + +static int +start_one_ctr (int ii, size_t pgsz, hdrv_pcl_ctx_t * pctx, char *error_string) +{ + // pe_attr should have been initialized in hwcdrv_create_counters() + struct perf_event_attr pe_attr; + memcpy (&pe_attr, &global_perf_event_def[ii].hw, sizeof (pe_attr)); + + // but we adjust the period, so make sure that pctx->ctr_list[ii].last_overflow_period has been set + pe_attr.sample_period = pctx->ctr_list[ii].last_overflow_period; + + int hwc_fd = perf_event_open (&pe_attr, pctx->tid, -1, -1, 0); + if (hwc_fd == -1) + { + TprintfT (DBG_LT1, "%s idx=%d perf_event_open failed, errno=%d\n", + error_string, ii, errno); + return 1; + } + + size_t buffer_area_sz = (NPAGES_PER_BUF + 1) * pgsz; // add a page for metadata + void * buf = mmap (NULL, buffer_area_sz, //YXXX is this a safe call? + PROT_READ | PROT_WRITE, MAP_SHARED, hwc_fd, 0); + if (buf == MAP_FAILED) + { + TprintfT (0, "sz = %ld, pgsz = %ld\n err=%s idx=%d mmap failed: %s\n", + (long) buffer_area_sz, (long) pgsz, error_string, ii, strerror (errno)); + return 1; + } + pctx->ctr_list[ii].ev_def = &global_perf_event_def[ii]; // why do we set ev_def? we never seem to use it + pctx->ctr_list[ii].fd = hwc_fd; + pctx->ctr_list[ii].buf_state.buf = buf; + pctx->ctr_list[ii].buf_state.pagesz = pgsz; + pctx->ctr_list[ii].value_state.prev_ena_ts = 0; + pctx->ctr_list[ii].value_state.prev_run_ts = 0; + pctx->ctr_list[ii].value_state.prev_value = 0; + pctx->ctr_list[ii].last_overflow_time = gethrtime (); + + /* set async mode */ + long flags = fcntl (hwc_fd, F_GETFL, 0) | O_ASYNC; + int rc = fcntl (hwc_fd, F_SETFL, flags); + if (rc == -1) + { + TprintfT (0, "%s idx=%d O_ASYNC failed\n", error_string, ii); + return 1; + } + + /* + * set lwp ownership of the fd + * See BUGS section of "man perf_event_open": + * The F_SETOWN_EX option to fcntl(2) is needed to properly get + * overflow signals in threads. This was introduced in Linux 2.6.32. + * Legacy references: + * see http://lkml.org/lkml/2009/8/4/128 + * google man fcntl F_SETOWN_EX -conflict + * "From Linux 2.6.32 onward, use F_SETOWN_EX to target + * SIGIO and SIGURG signals at a particular thread." + * http://icl.cs.utk.edu/papi/docs/da/d2a/examples__v2_8x_2self__smpl__multi_8c.html + * See 2010 CSCADS presentation by Eranian + */ + struct f_owner_ex fowner_ex; + fowner_ex.type = F_OWNER_TID; + fowner_ex.pid = pctx->tid; + rc = fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex); + if (rc == -1) + { + TprintfT (0, "%s idx=%d F_SETOWN failed\n", error_string, ii); + return 1; + } + + /* Use sigio so handler can determine FD via siginfo->si_fd. 
*/ + rc = fcntl (hwc_fd, F_SETSIG, SIGIO); + if (rc == -1) + { + TprintfT (0, "%s idx=%d F_SETSIG failed\n", error_string, ii); + return 1; + } + return 0; +} + +static int +stop_one_ctr (int ii, counter_state_t *ctr_list) +{ + int hwc_rc = 0; + if (-1 == ioctl (ctr_list[ii].fd, PERF_EVENT_IOC_DISABLE, 1)) + { + TprintfT (0, "hwcdrv: ERROR: PERF_EVENT_IOC_DISABLE #%d failed: errno=%d\n", ii, errno); + hwc_rc = HWCFUNCS_ERROR_GENERIC; + } + void *buf = ctr_list[ii].buf_state.buf; + if (buf) + { + size_t bufsz = (NPAGES_PER_BUF + 1) * ctr_list[ii].buf_state.pagesz; + ctr_list[ii].buf_state.buf = NULL; + int tmprc = munmap (buf, bufsz); + if (tmprc) + { + TprintfT (0, "hwcdrv: ERROR: munmap() #%d failed: errno=%d\n", ii, errno); + hwc_rc = HWCFUNCS_ERROR_GENERIC; + } + } + if (-1 == close (ctr_list[ii].fd)) + { + TprintfT (0, "hwcdrv: ERROR: close(fd) #%d failed: errno=%d\n", ii, errno); + hwc_rc = HWCFUNCS_ERROR_GENERIC; + } + return hwc_rc; +} + +/* HWCDRV_API for thread-specific actions */ +HWCDRV_API int +hwcdrv_lwp_init (void) +{ + return hwcdrv_start (); +} + +HWCDRV_API void +hwcdrv_lwp_fini (void) +{ + hwcdrv_free_counters (); /* also sets pctx->ctr_list=NULL; */ +} + +/* open */ +static int +hdrv_pcl_internal_open () +{ + if (hdrv_pcl_state.internal_open_called) + { + TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open: already called\n"); + return HWCFUNCS_ERROR_ALREADY_CALLED; + } + + // determine if PCL is available + perf_event_def_t tmp_event_def; + memset (&tmp_event_def, 0, sizeof (tmp_event_def)); + struct perf_event_attr *pe_attr = &tmp_event_def.hw; + init_perf_event (pe_attr, 0, 0); + pe_attr->type = PERF_TYPE_HARDWARE; // specify abstracted HW event + pe_attr->config = PERF_COUNT_HW_INSTRUCTIONS; // specify abstracted insts + int hwc_fd = perf_event_open (pe_attr, + 0, // pid/tid, 0 is self + -1, // cpu, -1 is per-thread mode + -1, // group_fd, -1 is root + 0); // flags + if (hwc_fd == -1) + { + TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open:" + " perf_event_open() failed, errno=%d\n", errno); + goto internal_open_error; + } + + /* see if the PCL is new enough to know about F_SETOWN_EX */ + struct f_owner_ex fowner_ex; + fowner_ex.type = F_OWNER_TID; + fowner_ex.pid = hwcdrv_gettid (); // "pid=tid" is correct w/F_OWNER_TID + if (fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex) == -1) + { + TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open: " + "F_SETOWN failed, errno=%d\n", errno); + close (hwc_fd); + goto internal_open_error; + } + close (hwc_fd); + + hdrv_pcl_state.internal_open_called = 1; + hdrv_pcl_state.library_ok = 1; // set to non-zero to show it's initted + hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED; + TprintfT (DBG_LT2, "hwcdrv: hdrv_pcl_internal_open()\n"); + for (int ii = 0; hdrv_pcbe_drivers[ii]; ii++) + { + hdrv_pcbe_api_t *ppcbe = hdrv_pcbe_drivers[ii]; + if (!ppcbe->hdrv_pcbe_init ()) + { + hdrv_pcl_about.cpcN_cciname = ppcbe->hdrv_pcbe_impl_name (); + hdrv_pcl_about.cpcN_cpuver = hwcdrv_lookup_cpuver (hdrv_pcl_about.cpcN_cciname); + if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED) + goto internal_open_error; + hdrv_pcl_about.cpcN_npics = ppcbe->hdrv_pcbe_ncounters (); + hdrv_pcl_about.cpcN_docref = ppcbe->hdrv_pcbe_cpuref (); + hdrv_pcl_state.get_events = ppcbe->hdrv_pcbe_get_events; + hwcdrv_get_x86_eventnum = ppcbe->hdrv_pcbe_get_eventnum; + break; + } + } + if (hdrv_pcl_about.cpcN_npics > MAX_PICS) + { + TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open:" + " reducing number of HWCs from %u to %u on processor 
'%s'\n", + hdrv_pcl_about.cpcN_npics, MAX_PICS, hdrv_pcl_about.cpcN_cciname); + hdrv_pcl_about.cpcN_npics = MAX_PICS; + } + TprintfT (DBG_LT1, "hwcdrv: hdrv_pcl_internal_open:" + " perf_event cpuver=%d, name='%s'\n", + hdrv_pcl_about.cpcN_cpuver, hdrv_pcl_about.cpcN_cciname); + return 0; + +internal_open_error: + hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED; + hdrv_pcl_about.cpcN_npics = 0; + hdrv_pcl_about.cpcN_docref = NULL; + hdrv_pcl_about.cpcN_cciname = NULL; + return HWCFUNCS_ERROR_NOT_SUPPORTED; +} + +static void * +single_thread_tsd_ftn () +{ + static hdrv_pcl_ctx_t tsd_context; + return &tsd_context; +} + +/* HWCDRV_API */ +HWCDRV_API int +hwcdrv_init (hwcfuncs_abort_fn_t abort_ftn, int *tsd_sz) +{ + hdrv_pcl_state.find_vpc_ctx = single_thread_tsd_ftn; + if (tsd_sz) + *tsd_sz = sizeof (hdrv_pcl_ctx_t); + + if (hdrv_pcl_state.internal_open_called) + return HWCFUNCS_ERROR_ALREADY_CALLED; + return hdrv_pcl_internal_open (); +} + +HWCDRV_API void +hwcdrv_get_info (int *cpuver, const char **cciname, uint_t *npics, + const char **docref, uint64_t *support) +{ + if (cpuver) + *cpuver = hdrv_pcl_about.cpcN_cpuver; + if (cciname) + *cciname = hdrv_pcl_about.cpcN_cciname; + if (npics) + *npics = hdrv_pcl_about.cpcN_npics; + if (docref) + *docref = hdrv_pcl_about.cpcN_docref; + if (support) + *support = HWCFUNCS_SUPPORT_OVERFLOW_PROFILING | HWCFUNCS_SUPPORT_OVERFLOW_CTR_ID; +} + +HWCDRV_API int +hwcdrv_enable_mt (hwcfuncs_tsd_get_fn_t tsd_ftn) +{ + if (tsd_ftn) + hdrv_pcl_state.find_vpc_ctx = tsd_ftn; + else + { + TprintfT (0, "hwcdrv: ERROR: enable_mt(): tsd_ftn==NULL\n"); + return HWCFUNCS_ERROR_UNAVAIL; + } + return 0; +} + +HWCDRV_API int +hwcdrv_get_descriptions (hwcf_hwc_cb_t *hwc_cb, hwcf_attr_cb_t *attr_cb) +{ + int count = 0; + if (hwc_cb && hdrv_pcl_state.get_events) + count = hdrv_pcl_state.get_events (hwc_cb); + if (attr_cb) + for (int ii = 0; perfctr_attrs_table && perfctr_attrs_table[ii].attrname; ii++) + attr_cb (perfctr_attrs_table[ii].attrname); + if (!count) + return -1; + return 0; +} + +HWCDRV_API int +hwcdrv_assign_regnos (Hwcentry* entries[], unsigned numctrs) +{ + return hwcdrv_assign_all_regnos (entries, numctrs); +} + +static int +internal_hwc_start (int fd) +{ + int rc = ioctl (fd, PERF_EVENT_IOC_REFRESH, 1); + if (rc == -1) + { + TprintfT (DBG_LT0, "hwcdrv: ERROR: internal_hwc_start:" + " PERF_EVENT_IOC_REFRESH(fd=%d) failed: errno=%d\n", fd, errno); + return HWCFUNCS_ERROR_UNAVAIL; + } + TprintfT (DBG_LT3, "hwcdrv: internal_hwc_start(fd=%d)\n", fd); + return 0; +} + +HWCDRV_API int +hwcdrv_overflow (siginfo_t *si, hwc_event_t *eventp, hwc_event_t *lost_events) +{ + /* set expired counters to overflow value and all others to 0 */ + /* return 0: OK, counters should be restarted */ + /* return non-zero: eventp not set, counters should not be restarted */ + /* clear return values */ + int ii; + for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++) + { + eventp->ce_pic[ii] = 0; + lost_events->ce_pic[ii] = 0; + } + hrtime_t sig_ts = gethrtime (); //YXXX get this from HWC event? 
+ eventp->ce_hrt = sig_ts; + lost_events->ce_hrt = sig_ts; + + /* determine source signal */ + int signal_fd = -1; + switch (si->si_code) + { + case POLL_HUP: /* expected value from pcl */ + /* According to Stephane Eranian: + * "expect POLL_HUP instead of POLL_IN because we are + * in one-shot mode (IOC_REFRESH)" + */ + signal_fd = si->si_fd; + break; + case SI_TKILL: /* event forwarded by tkill */ + /* DBX can only forward SI_TKILL when it detects POLL_HUP + * unfortunately, this means that si->si_fd has been lost... + * We need to process the buffers, but we don't know the fd! + */ + TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:" + " SI_TKILL detected\n", sig_ts); + break; + default: + // "sometimes we see a POLL_IN (1) with very high event rates," + // according to eranian(?) + TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:" + " unexpected si_code 0x%x\n", sig_ts, si->si_code); + return HWCFUNCS_ERROR_GENERIC; + } + + hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx (); + if (!pctx) + { + TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:" + " tsd context is NULL\n", sig_ts); + return HWCFUNCS_ERROR_UNEXPECTED; + } + counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list; + if (!ctr_list) + { + TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:" + " ctr_list is NULL\n", sig_ts); + return HWCFUNCS_ERROR_UNEXPECTED; + } + + /* clear needs_restart flag */ + for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++) + ctr_list[ii].needs_restart = 0; + + /* attempt to identify the counter to read */ + int signal_idx = -1; + pctx->signal_fd = signal_fd; // save the signal provided by siginfo_t + if (signal_fd != -1) + { + for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++) + { + if (ctr_list[ii].fd == signal_fd) + { + signal_idx = ii; + break; + } + } + } + + if (signal_idx < 0) + { + TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:" + " pmc not determined!\n", sig_ts); + lost_events->ce_pic[0] = 1; /* record a bogus value into experiment */ + // note: bogus value may get overwritten in loop below + } + + /* capture sample(s). In addition to signal_idx, check other counters. */ + struct perf_event_header sheader; + int idx; + for (idx = 0; idx < hdrv_pcl_state.hwcdef_cnt; idx++) + { + int num_recs = 0; + while (1) + { + /* check for samples */ + struct perf_event_mmap_page *metadata = ctr_list[idx].buf_state.buf; + if (metadata == NULL) + break; // empty + if (metadata->data_tail == metadata->data_head) + break; // empty + + /* read header */ + if (read_buf (&ctr_list[idx].buf_state, &sheader, sizeof (sheader))) + break; + num_recs++; + + /* check for PERF_RECORD_SAMPLE */ + size_t datasz = sheader.size - sizeof (struct perf_event_header); + if (sheader.type != PERF_RECORD_SAMPLE) + { + TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:" + " unexpected recd type=%d\n", + sig_ts, sheader.type); + if (skip_buf (&ctr_list[idx].buf_state, datasz)) + { + TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:" + " skip recd type=%d failed\n", sig_ts, sheader.type); + lost_events->ce_pic[idx] = 4; /* record a bogus value */ + break; // failed to skip buffer?? 
+ } + lost_events->ce_pic[idx] = 2; /* record a bogus value */ + continue; // advance to next record + } + + /* type is PERF_RECORD_SAMPLE */ + uint64_t value, lostv; + if (read_sample (&ctr_list[idx], datasz, &value, &lostv)) + { + TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:" + " read_sample() failed\n", sig_ts); + lost_events->ce_pic[idx] = 3; // record a bogus value + break; // failed to read sample data?? + } + TprintfT (DBG_LT3, "hwcdrv: sig_ts=%llu: hwcdrv_overflow:" + " idx=%d value=%llu lost=%llu\n", (unsigned long long) sig_ts, + idx, (unsigned long long) value, (unsigned long long) lostv); + if (eventp->ce_pic[idx]) + { + TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:" + " idx=%d previous sample recorded as lost_event\n", sig_ts, idx); + lost_events->ce_pic[idx] += eventp->ce_pic[idx]; + } + eventp->ce_pic[idx] = value; + lost_events->ce_pic[idx] += lostv; + } + + /* debug output for unexpected (but common) cases */ + if (idx == signal_idx) + { + if (num_recs != 1) + TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:" + " %d records for signal_idx=%d\n", sig_ts, num_recs, signal_idx); + } + else if (num_recs) + TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:" + " %d unexpected record(s) for idx=%d (signal_idx=%d)\n", + sig_ts, num_recs, idx, signal_idx); + + /* trigger counter restart whenever records were found */ + if (num_recs) + { + /* check whether to adapt the overflow interval */ + /* This is the Linux version. + * The Solaris version is in hwprofile.c collector_update_overflow_counters(). + */ + hrtime_t min_time = global_perf_event_def[idx].min_time; + if (min_time > 0 // overflow interval is adaptive + && sig_ts - ctr_list[idx].last_overflow_time < min_time) // last interval below min + { + /* pick a new overflow interval */ + /* roughly doubled, but add funny numbers */ + /* hopefully the result is prime or not a multiple of some # of ops/loop */ + uint64_t new_period = 2 * ctr_list[idx].last_overflow_period + 37; +#if 0 + // On Solaris, we report the adjustment to the log file. + // On Linux it's hard for us to do so since hwcdrv_pcl.c doesn't know about collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_HWCADJ. + // For now we simply don't report. + collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n", + SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name, + ctr_list[idx].last_overflow_period, new_period); +#endif + /* There are a variety of ways of resetting the period on Linux. + * The most elegant is + * ioctl(fd,PERF_EVENT_IOC_PERIOD,&period) + * but check the perf_event_open man page for PERF_EVENT_IOC_PERIOD: + * > Prior to Linux 2.6.36 this ioctl always failed due to a bug in the kernel. + * > Prior to Linux 3.14 (or 3.7 on ARM), the new period did not take effect + * until after the next overflow. + * So we're kind of stuck shutting the fd down and restarting it with the new period. 
+ */ + if (stop_one_ctr (idx, ctr_list)) + { + // EUGENE figure out what to do on error + } + ctr_list[idx].last_overflow_period = new_period; + if (start_one_ctr (idx, ctr_list[idx].buf_state.pagesz, pctx, "hwcdrv: ERROR: hwcdrv_overflow (readjust overflow):")) + { + // EUGENE figure out what to do on error + } + } + ctr_list[idx].last_overflow_time = sig_ts; +#if 0 + ctr_list[idx].needs_restart = 1; +#else // seems to be more reliable to restart here instead of hwcdrv_sighlr_restart() + internal_hwc_start (ctr_list[idx].fd); +#endif + } + } + return 0; // OK to restart counters +} + +HWCDRV_API int +hwcdrv_sighlr_restart (const hwc_event_t *pp) +{ +#if 0 // restarting here doesn't seem to work as well as restarting in hwcdrv_overflow() + hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx (); + if (!pctx) + { + TprintfT (DBG_LT0, "hwcdrv: ERROR: hwcdrv_sighlr_restart: find_vpc_ctx()==NULL\n"); + return -1; + } + counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list; + if (!ctr_list) + { + TprintfT (DBG_LT0, "hwcdrv: WARNING: hwcdrv_sighlr_restart: ctr_list is NULL\n"); + return -1; + } + int errors = 0; + for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++) + { + if (ctr_list[ii].needs_restart) + errors |= internal_hwc_start (ctr_list[ii].fd); + ctr_list[ii].needs_restart = 0; + } + return errors; +#else + return 0; +#endif +} + +/* create counters based on hwcdef[] */ +HWCDRV_API int +hwcdrv_create_counters (unsigned hwcdef_cnt, Hwcentry *hwcdef) +{ + if (hwcdef_cnt > hdrv_pcl_about.cpcN_npics) + { + logerr (GTXT ("More than %d counters were specified\n"), hdrv_pcl_about.cpcN_npics); /*!*/ + return HWCFUNCS_ERROR_HWCARGS; + } + if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED) + { + logerr (GTXT ("Processor not supported\n")); + return HWCFUNCS_ERROR_HWCARGS; + } + + /* add counters */ + for (unsigned idx = 0; idx < hwcdef_cnt; idx++) + { + perf_event_def_t *glb_event_def = &global_perf_event_def[idx]; + memset (glb_event_def, 0, sizeof (perf_event_def_t)); + unsigned int pmc_sel; + eventsel_t evntsel; + if (hwcfuncs_get_x86_eventsel (hwcdef[idx].reg_num, + hwcdef[idx].int_name, &evntsel, &pmc_sel)) + { + TprintfT (0, "hwcdrv: ERROR: hwcfuncs_get_x86_eventsel() failed\n"); + return HWCFUNCS_ERROR_HWCARGS; + } + glb_event_def->reg_num = pmc_sel; + glb_event_def->eventsel = evntsel; + glb_event_def->counter_preload = hwcdef[idx].val; + glb_event_def->min_time = hwcdef[idx].min_time; + glb_event_def->name = strdup (hwcdef[idx].name); // memory leak??? 
very minor + init_perf_event (&glb_event_def->hw, glb_event_def->eventsel, + glb_event_def->counter_preload); + TprintfT (DBG_LT1, "hwcdrv: create_counters: pic=%u name='%s' interval=%lld" + "(min_time=%lld): reg_num=0x%x eventsel=0x%llx ireset=%lld usr=%lld sys=%lld\n", + idx, hwcdef[idx].int_name, (long long) glb_event_def->counter_preload, + (long long) glb_event_def->min_time, (int) glb_event_def->reg_num, + (long long) glb_event_def->eventsel, + (long long) HW_INTERVAL_PRESET (hwcdef[idx].val), + (long long) glb_event_def->hw.exclude_user, + (long long) glb_event_def->hw.exclude_kernel); + } + + hdrv_pcl_state.hwcdef_cnt = hwcdef_cnt; + return 0; +} + +HWCDRV_API int +hwcdrv_free_counters () // note: only performs shutdown for this thread +{ + hdrv_pcl_ctx_t * pctx; + if (!COUNTERS_ENABLED ()) + return 0; + pctx = hdrv_pcl_state.find_vpc_ctx (); + if (!pctx) + { + TprintfT (0, "hwcdrv: WARNING: hwcdrv_free_counters: tsd context is NULL\n"); + return HWCFUNCS_ERROR_GENERIC; + } + counter_state_t *ctr_list = pctx->ctr_list; + if (!ctr_list) + { + // fork child: prolog suspends hwcs, then epilog frees them + TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_free_counters: ctr_list is already NULL\n"); + return 0; + } + int hwc_rc = 0; + for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++) + if (stop_one_ctr (ii, ctr_list)) + hwc_rc = HWCFUNCS_ERROR_GENERIC; + TprintfT (DBG_LT1, "hwcdrv: hwcdrv_free_counters(tid=0x%lx).\n", pctx->tid); + pctx->ctr_list = NULL; + return hwc_rc; +} + +HWCDRV_API int +hwcdrv_start (void) /* must be called from each thread ? */ +{ + hdrv_pcl_ctx_t *pctx = NULL; + if (!COUNTERS_ENABLED ()) + { + TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_start: no counters to start \n"); + return 0; + } + if (!hdrv_pcl_state.library_ok) + { + TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: library is not open\n"); + return HWCFUNCS_ERROR_NOT_SUPPORTED; + } + + /* + * set up per-thread context + */ + pctx = hdrv_pcl_state.find_vpc_ctx (); + if (!pctx) + { + TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: tsd context is NULL\n"); + return HWCFUNCS_ERROR_UNEXPECTED; + } + pctx->tid = hwcdrv_gettid (); + TprintfT (DBG_LT1, "hwcdrv: hwcdrv_start(tid=0x%lx)\n", pctx->tid); + + /* + * create per-thread counter list + */ + counter_state_t *ctr_list = (counter_state_t *) calloc (hdrv_pcl_state.hwcdef_cnt, + sizeof (counter_state_t)); + if (!ctr_list) + { + TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: calloc(ctr_list) failed\n"); + return HWCFUNCS_ERROR_MEMORY; + } + int ii; + for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++) + ctr_list[ii].fd = -1; // invalidate fds in case we have to close prematurely + pctx->ctr_list = ctr_list; + + /* + * bind the counters + */ + size_t pgsz = sysconf (_SC_PAGESIZE); + for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++) + { + ctr_list[ii].last_overflow_period = global_perf_event_def[ii].hw.sample_period; + if (start_one_ctr (ii, pgsz, pctx, "hwcdrv: ERROR: hwcdrv_start:")) goto hwcdrv_start_cleanup; + } + + /* + * start the counters + */ + for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++) + { + int rc = internal_hwc_start (ctr_list[ii].fd); + if (rc < 0) + goto hwcdrv_start_cleanup; + } + return 0; + +hwcdrv_start_cleanup: + hwcdrv_free_counters (); // PERF_EVENT_IOC_DISABLE and close() for all fds + return HWCFUNCS_ERROR_UNAVAIL; +} + +HWCDRV_API int +hwcdrv_lwp_suspend (void) /* must be called from each thread */ +{ + if (!COUNTERS_ENABLED ()) + { + TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_suspend: no counters\n"); + return 0; + } + TprintfT (DBG_LT1, 
"hwcdrv: hwcdrv_lwp_suspend()\n"); + return hwcdrv_free_counters (); +} + +HWCDRV_API int +hwcdrv_lwp_resume (void) /* must be called from each thread */ +{ + if (!COUNTERS_ENABLED ()) + { + TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_resume: no counters\n"); + return 0; + } + TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_resume()\n"); + return hwcdrv_start (); +} + +HWCDRV_API int +hwcdrv_read_events (hwc_event_t *overflow_data, hwc_event_samples_t *sampled_data) +{ + overflow_data->ce_hrt = 0; + for (int i = 0; i < MAX_PICS; i++) + { + overflow_data->ce_pic[i] = 0; + if (sampled_data) + HWCFUNCS_SAMPLE_RESET (&sampled_data->sample[i]); + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* HWCDRV_API */ + +hwcdrv_api_t hwcdrv_pcl_api = { + hwcdrv_init, + hwcdrv_get_info, + hwcdrv_enable_mt, + hwcdrv_get_descriptions, + hwcdrv_assign_regnos, + hwcdrv_create_counters, + hwcdrv_start, + hwcdrv_overflow, + hwcdrv_read_events, + hwcdrv_sighlr_restart, + hwcdrv_lwp_suspend, + hwcdrv_lwp_resume, + hwcdrv_free_counters, + hwcdrv_lwp_init, + hwcdrv_lwp_fini, + -1 // hwcdrv_init_status +}; |