From 9af067bfcdc9a006a7d3517c8c8f805a22b36a35 Mon Sep 17 00:00:00 2001 From: Vladimir Mezentsev Date: Sat, 1 Jun 2024 20:36:06 -0700 Subject: gprofng: add hardware counters for AMD Zen4 ChangeLog 2024-06-01 Vladimir Mezentsev * common/hwctable.c: Add the hwc table for AMD Zen4. * src/hwc_amd_zen4.h: New file. * src/hwc_amd_zen3.h: Define _HWC_AMD_ZEN3_H. --- gprofng/common/hwctable.c | 22 ++ gprofng/src/hwc_amd_zen3.h | 5 +- gprofng/src/hwc_amd_zen4.h | 863 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 889 insertions(+), 1 deletion(-) create mode 100644 gprofng/src/hwc_amd_zen4.h (limited to 'gprofng') diff --git a/gprofng/common/hwctable.c b/gprofng/common/hwctable.c index 0b4800e..b3ccb36 100644 --- a/gprofng/common/hwctable.c +++ b/gprofng/common/hwctable.c @@ -1303,6 +1303,7 @@ static Hwcentry generic_list[] = { }; #include "hwc_amd_zen3.h" +#include "hwc_amd_zen4.h" /* structure defining the counters for a CPU type */ typedef struct @@ -1353,6 +1354,7 @@ static cpu_list_t cputabs[] = { {ARM_CPU_IMP_APM, generic_list, {"insts,,cycles", 0}}, {CPC_AMD_Authentic, generic_list, {"insts,,cycles", 0}}, {CPC_AMD_FAM_19H_ZEN3, amd_zen3_list, {"insts,,cycles", 0}}, + {CPC_AMD_FAM_19H_ZEN4, amd_zen4_list, {"insts,,cycles", 0}}, {0, generic_list, {"insts,,cycles", 0}}, }; @@ -1825,6 +1827,26 @@ setup_cpc_general (int skip_hwc_test) hwcdrv->hwcdrv_get_info (&cpcx_cpuver, &cpcx_cciname, &cpcx_npics, &cpcx_docref, &cpcx_support_bitmask); + /* Fix cpcx_cpuver for new Zen machines */ + cpu_info_t *cpu_p = read_cpuinfo (); + if (strcmp (cpu_p->cpu_vendorstr, "AuthenticAMD") == 0) + { + if (cpu_p->cpu_family == AMD_ZEN3_FAMILY) + switch (cpu_p->cpu_model) + { + case AMD_ZEN3_RYZEN: + case AMD_ZEN3_RYZEN2: + case AMD_ZEN3_RYZEN3: + case AMD_ZEN3_EPYC_TRENTO: + cpcx_cpuver = CPC_AMD_FAM_19H_ZEN3; + break; + case AMD_ZEN4_RYZEN: + case AMD_ZEN4_EPYC: + cpcx_cpuver = CPC_AMD_FAM_19H_ZEN4; + break; + } + } + #ifdef DISALLOW_PENTIUM_PRO_MMX_7007575 if (cpcx_cpuver == CPC_PENTIUM_PRO_MMX) { diff --git a/gprofng/src/hwc_amd_zen3.h b/gprofng/src/hwc_amd_zen3.h index a6400f5..6a2ee02 100644 --- a/gprofng/src/hwc_amd_zen3.h +++ b/gprofng/src/hwc_amd_zen3.h @@ -18,6 +18,9 @@ Foundation, 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ +#ifndef _HWC_AMD_ZEN3_H +#define _HWC_AMD_ZEN3_H + #define I(nm, event, umask, mtr) INIT_HWC(nm, mtr, (event) | ((umask) << 8), PERF_TYPE_RAW) static Hwcentry amd_zen3_list[] = { @@ -629,4 +632,4 @@ static Hwcentry amd_zen3_list[] = { }; #undef I - +#endif diff --git a/gprofng/src/hwc_amd_zen4.h b/gprofng/src/hwc_amd_zen4.h new file mode 100644 index 0000000..1750194 --- /dev/null +++ b/gprofng/src/hwc_amd_zen4.h @@ -0,0 +1,863 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + Contributed by Oracle. + + This file is part of GNU Binutils. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, 51 Franklin Street - Fifth Floor, Boston, + MA 02110-1301, USA. 
*/ + +#ifndef _HWC_AMD_ZEN4_H +#define _HWC_AMD_ZEN4_H + +#define I(nm, event, umask, mtr) INIT_HWC(nm, mtr, (event) | ((umask) << 8), PERF_TYPE_RAW) + +static Hwcentry amd_zen4_list[] = { + HWC_GENERIC +/* branch: */ + { I("bp_de_redirect", 0x91, 0, + STXT("Instruction decoder corrects the predicted target and resteers the" + "branch predictor")) }, + { I("bp_dyn_ind_pred", 0x8e, 0, + STXT("Dynamic indirect predictions (branch used the indirect predictor to" + "make a prediction)")) }, + { I("bp_l2_btb_correct", 0x8b, 0, + STXT("L2 branch prediction overrides existing prediction (speculative)")) }, + { I("ex_ret_brn", 0xc2, 0, + STXT("Retired branch instructions (all types of architectural control flow" + "changes, including exceptions and interrupts)")) }, + { I("ex_ret_brn_far", 0xc6, 0, + STXT("Retired far control transfers (far call/jump/return, IRET, SYSCALL and" + "SYSRET, plus exceptions and interrupts). Far control transfers are not" + "subject to branch prediction")) }, + { I("ex_ret_brn_ind_misp", 0xca, 0, + STXT("Retired indirect branch instructions mispredicted (only EX" + "mispredicts). Each misprediction incurs the same penalty as a" + "mispredicted conditional branch instruction")) }, + { I("ex_ret_brn_misp", 0xc3, 0, + STXT("Retired branch instructions mispredicted")) }, + { I("ex_ret_brn_tkn", 0xc4, 0, + STXT("Retired taken branch instructions (all types of architectural control" + "flow changes, including exceptions and interrupts)")) }, + { I("ex_ret_brn_tkn_misp", 0xc5, 0, + STXT("Retired taken branch instructions mispredicted")) }, + { I("ex_ret_cond", 0xd1, 0, + STXT("Retired conditional branch instructions")) }, + { I("ex_ret_ind_brch_instr", 0xcc, 0, + STXT("Retired indirect branch instructions")) }, + { I("ex_ret_msprd_brnch_instr_dir_msmtch", 0x1c7, 0, + STXT("Retired branch instructions mispredicted due to direction mismatch")) }, + { I("ex_ret_near_ret", 0xc8, 0, + STXT("Retired near returns (RET or RET Iw)")) }, + { I("ex_ret_near_ret_mispred", 0xc9, 0, + STXT("Retired near returns mispredicted. 
Each misprediction incurs the same" + "penalty as a mispredicted conditional branch instruction")) }, + { I("ex_ret_uncond_brnch_instr", 0x1c9, 0, + STXT("Retired unconditional branch instructions")) }, + { I("ex_ret_uncond_brnch_instr_mispred", 0x1c8, 0, + STXT("Retired unconditional indirect branch instructions mispredicted")) }, +/* cache: */ + { I("ic_cache_fill_l2", 0x82, 0, + STXT("Instruction cache lines (64 bytes) fulfilled from the L2 cache")) }, + { I("ic_cache_fill_sys", 0x83, 0, + STXT("Instruction cache lines (64 bytes) fulfilled from system memory or" + "another cache")) }, + { I("ic_tag_hit_miss.all_instruction_cache_accesses", 0x18e, 0x1f, + STXT("Instruction cache accesses of all types")) }, + { I("ic_tag_hit_miss.instruction_cache_hit", 0x18e, 0x7, + STXT("Instruction cache hits")) }, + { I("ic_tag_hit_miss.instruction_cache_miss", 0x18e, 0x18, + STXT("Instruction cache misses")) }, + { I("l2_cache_req_stat.all", 0x64, 0xff, + STXT("Core to L2 cache requests (not including L2 prefetch) for data and" + "instruction cache access")) }, + { I("l2_cache_req_stat.dc_access_in_l2", 0x64, 0xf8, + STXT("Core to L2 cache requests (not including L2 prefetch) for data cache" + "access")) }, + { I("l2_cache_req_stat.dc_hit_in_l2", 0x64, 0xf0, + STXT("Core to L2 cache requests (not including L2 prefetch) for data cache" + "hits")) }, + { I("l2_cache_req_stat.ic_access_in_l2", 0x64, 0x7, + STXT("Core to L2 cache requests (not including L2 prefetch) for instruction" + "cache access")) }, + { I("l2_cache_req_stat.ic_dc_hit_in_l2", 0x64, 0xf6, + STXT("Core to L2 cache requests (not including L2 prefetch) for data and" + "instruction cache hits")) }, + { I("l2_cache_req_stat.ic_dc_miss_in_l2", 0x64, 0x9, + STXT("Core to L2 cache requests (not including L2 prefetch) for data and" + "instruction cache misses")) }, + { I("l2_cache_req_stat.ic_fill_hit_s", 0x64, 0x2, + STXT("Core to L2 cache requests (not including L2 prefetch) with status:" + "instruction cache hit non-modifiable line in L2")) }, + { I("l2_cache_req_stat.ic_fill_hit_x", 0x64, 0x4, + STXT("Core to L2 cache requests (not including L2 prefetch) with status:" + "instruction cache hit modifiable line in L2")) }, + { I("l2_cache_req_stat.ic_fill_miss", 0x64, 0x1, + STXT("Core to L2 cache requests (not including L2 prefetch) with status:" + "instruction cache request miss in L2")) }, + { I("l2_cache_req_stat.ic_hit_in_l2", 0x64, 0x6, + STXT("Core to L2 cache requests (not including L2 prefetch) for instruction" + "cache hits")) }, + { I("l2_cache_req_stat.ls_rd_blk_c", 0x64, 0x8, + STXT("Core to L2 cache requests (not including L2 prefetch) with status:" + "data cache request miss in L2")) }, + { I("l2_cache_req_stat.ls_rd_blk_cs", 0x64, 0x80, + STXT("Core to L2 cache requests (not including L2 prefetch) with status:" + "data cache shared read hit in L2")) }, + { I("l2_cache_req_stat.ls_rd_blk_l_hit_s", 0x64, 0x20, + STXT("Core to L2 cache requests (not including L2 prefetch) with status:" + "data cache read hit non-modifiable line in L2")) }, + { I("l2_cache_req_stat.ls_rd_blk_l_hit_x", 0x64, 0x40, + STXT("Core to L2 cache requests (not including L2 prefetch) with status:" + "data cache read hit modifiable line in L2")) }, + { I("l2_cache_req_stat.ls_rd_blk_x", 0x64, 0x10, + STXT("Core to L2 cache requests (not including L2 prefetch) with status:" + "data cache store or state change hit in L2")) }, + { I("l2_pf_hit_l2.all", 0x70, 0xff, + STXT("L2 prefetches accepted by the L2 pipeline which hit in the L2 cache of" + "all types")) }, 
+ { I("l2_pf_hit_l2.l1_region", 0x70, 0x80, + STXT("L2 prefetches accepted by the L2 pipeline which hit in the L2 cache of" + "type L1Region (fetch additional lines into L1 cache when the data" + "access for a given instruction tends to be followed by a consistent" + "pattern of other accesses within a localized region)")) }, + { I("l2_pf_hit_l2.l1_stream", 0x70, 0x20, + STXT("L2 prefetches accepted by the L2 pipeline which hit in the L2 cache of" + "type L1Stream (fetch additional sequential lines into L1 cache)")) }, + { I("l2_pf_hit_l2.l1_stride", 0x70, 0x40, + STXT("L2 prefetches accepted by the L2 pipeline which hit in the L2 cache of" + "type L1Stride (fetch additional lines into L1 cache when each access" + "is a constant distance from the previous)")) }, + { I("l2_pf_hit_l2.l2_burst", 0x70, 0x8, + STXT("L2 prefetches accepted by the L2 pipeline which hit in the L2 cache of" + "type L2Burst (aggressively fetch additional sequential lines into L2" + "cache)")) }, + { I("l2_pf_hit_l2.l2_next_line", 0x70, 0x2, + STXT("L2 prefetches accepted by the L2 pipeline which hit in the L2 cache of" + "type L2NextLine (fetch the next line into L2 cache)")) }, + { I("l2_pf_hit_l2.l2_stream", 0x70, 0x1, + STXT("L2 prefetches accepted by the L2 pipeline which hit in the L2 cache of" + "type L2Stream (fetch additional sequential lines into L2 cache)")) }, + { I("l2_pf_hit_l2.l2_stride", 0x70, 0x10, + STXT("L2 prefetches accepted by the L2 pipeline which hit in the L2 cache of" + "type L2Stride (fetch additional lines into L2 cache when each access" + "is at a constant distance from the previous)")) }, + { I("l2_pf_hit_l2.l2_up_down", 0x70, 0x4, + STXT("L2 prefetches accepted by the L2 pipeline which hit in the L2 cache of" + "type L2UpDown (fetch the next or previous line into L2 cache for all" + "memory accesses)")) }, + { I("l2_pf_miss_l2_hit_l3.all", 0x71, 0xff, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 cache and" + "hit in the L3 cache cache of all types")) }, + { I("l2_pf_miss_l2_hit_l3.l1_region", 0x71, 0x80, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 cache and" + "hit in the L3 cache of type L1Region (fetch additional lines into L1" + "cache when the data access for a given instruction tends to be" + "followed by a consistent pattern of other accesses within a localized" + "region)")) }, + { I("l2_pf_miss_l2_hit_l3.l1_stream", 0x71, 0x20, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 cache and" + "hit in the L3 cache of type L1Stream (fetch additional sequential" + "lines into L1 cache)")) }, + { I("l2_pf_miss_l2_hit_l3.l1_stride", 0x71, 0x40, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 cache and" + "hit in the L3 cache of type L1Stride (fetch additional lines into L1" + "cache when each access is a constant distance from the previous)")) }, + { I("l2_pf_miss_l2_hit_l3.l2_burst", 0x71, 0x8, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 cache and" + "hit in the L3 cache of type L2Burst (aggressively fetch additional" + "sequential lines into L2 cache)")) }, + { I("l2_pf_miss_l2_hit_l3.l2_next_line", 0x71, 0x2, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 cache and" + "hit in the L3 cache of type L2NextLine (fetch the next line into L2" + "cache)")) }, + { I("l2_pf_miss_l2_hit_l3.l2_stream", 0x71, 0x1, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 cache and" + "hit in the L3 cache of type L2Stream (fetch additional sequential" + 
"lines into L2 cache)")) }, + { I("l2_pf_miss_l2_hit_l3.l2_stride", 0x71, 0x10, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 cache and" + "hit in the L3 cache of type L2Stride (fetch additional lines into L2" + "cache when each access is a constant distance from the previous)")) }, + { I("l2_pf_miss_l2_hit_l3.l2_up_down", 0x71, 0x4, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 cache and" + "hit in the L3 cache of type L2UpDown (fetch the next or previous line" + "into L2 cache for all memory accesses)")) }, + { I("l2_pf_miss_l2_l3.all", 0x72, 0xff, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3" + "caches of all types")) }, + { I("l2_pf_miss_l2_l3.l1_region", 0x72, 0x80, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3" + "caches of type L1Region (fetch additional lines into L1 cache when the" + "data access for a given instruction tends to be followed by a" + "consistent pattern of other accesses within a localized region)")) }, + { I("l2_pf_miss_l2_l3.l1_stream", 0x72, 0x20, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3" + "caches of type L1Stream (fetch additional sequential lines into L1" + "cache)")) }, + { I("l2_pf_miss_l2_l3.l1_stride", 0x72, 0x40, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3" + "caches of type L1Stride (fetch additional lines into L1 cache when" + "each access is a constant distance from the previous)")) }, + { I("l2_pf_miss_l2_l3.l2_burst", 0x72, 0x8, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3" + "caches of type L2Burst (aggressively fetch additional sequential lines" + "into L2 cache)")) }, + { I("l2_pf_miss_l2_l3.l2_next_line", 0x72, 0x2, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3" + "caches of type L2NextLine (fetch the next line into L2 cache)")) }, + { I("l2_pf_miss_l2_l3.l2_stream", 0x72, 0x1, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3" + "caches of type L2Stream (fetch additional sequential lines into L2" + "cache)")) }, + { I("l2_pf_miss_l2_l3.l2_stride", 0x72, 0x10, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3" + "caches of type L2Stride (fetch additional lines into L2 cache when" + "each access is a constant distance from the previous)")) }, + { I("l2_pf_miss_l2_l3.l2_up_down", 0x72, 0x4, + STXT("L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3" + "caches of type L2UpDown (fetch the next or previous line into L2 cache" + "for all memory accesses)")) }, + { I("l2_request_g1.all", 0x60, 0xff, + STXT("L2 cache requests of all types")) }, + { I("l2_request_g1.all_dc", 0x60, 0xe8, + STXT("L2 cache requests of common types from L1 data cache (including" + "prefetches)")) }, + { I("l2_request_g1.all_no_prefetch", 0x60, 0xf9, + STXT("L2 cache requests of common types not including prefetches")) }, + { I("l2_request_g1.cacheable_ic_read", 0x60, 0x10, + STXT("L2 cache requests: instruction cache reads")) }, + { I("l2_request_g1.change_to_x", 0x60, 0x8, + STXT("L2 cache requests: data cache state change to writable, check L2 for" + "current state")) }, + { I("l2_request_g1.group2", 0x60, 0x1, + STXT("L2 cache requests of non-cacheable type (non-cached data and" + "instructions reads, self-modifying code checks)")) }, + { I("l2_request_g1.l2_hw_pf", 0x60, 0x2, + STXT("L2 cache requests: from hardware prefetchers to prefetch directly 
into" + "L2 (hit or miss)")) }, + { I("l2_request_g1.ls_rd_blk_c_s", 0x60, 0x20, + STXT("L2 cache requests: data cache shared reads")) }, + { I("l2_request_g1.prefetch_l2_cmd", 0x60, 0x4, + STXT("L2 cache requests: prefetch directly into L2")) }, + { I("l2_request_g1.rd_blk_l", 0x60, 0x80, + STXT("L2 cache requests: data cache reads including hardware and software" + "prefetch")) }, + { I("l2_request_g1.rd_blk_x", 0x60, 0x40, + STXT("L2 cache requests: data cache stores")) }, + { I("ls_alloc_mab_count", 0x5f, 0, + STXT("In-flight L1 data cache misses i.e. Miss Address Buffer (MAB)" + "allocations each cycle")) }, + { I("ls_any_fills_from_sys.all", 0x44, 0xff, + STXT("Any data cache fills from all types of data sources")) }, + { I("ls_any_fills_from_sys.all_dram_io", 0x44, 0x48, + STXT("Any data cache fills from either DRAM or MMIO in any NUMA node (same" + "or different socket)")) }, + { I("ls_any_fills_from_sys.alternate_memories", 0x44, 0x80, + STXT("Any data cache fills from extension memory")) }, + { I("ls_any_fills_from_sys.dram_io_all", 0x44, 0x48, + STXT("Any data cache fills from either DRAM or MMIO in any NUMA node (same" + "or different socket)")) }, + { I("ls_any_fills_from_sys.dram_io_far", 0x44, 0x40, + STXT("Any data cache fills from either DRAM or MMIO in a different NUMA node" + "(same or different socket)")) }, + { I("ls_any_fills_from_sys.dram_io_near", 0x44, 0x8, + STXT("Any data cache fills from either DRAM or MMIO in the same NUMA node")) }, + { I("ls_any_fills_from_sys.far_all", 0x44, 0x50, + STXT("Any data cache fills from either cache of another CCX, DRAM or MMIO" + "when the address was in a different NUMA node (same or different" + "socket)")) }, + { I("ls_any_fills_from_sys.far_cache", 0x44, 0x10, + STXT("Any data cache fills from cache of another CCX when the address was in" + "a different NUMA node")) }, + { I("ls_any_fills_from_sys.local_all", 0x44, 0x3, + STXT("Any data cache fills from local L2 cache or L3 cache or different L2" + "cache in the same CCX")) }, + { I("ls_any_fills_from_sys.local_ccx", 0x44, 0x2, + STXT("Any data cache fills from L3 cache or different L2 cache in the same" + "CCX")) }, + { I("ls_any_fills_from_sys.local_l2", 0x44, 0x1, + STXT("Any data cache fills from local L2 cache")) }, + { I("ls_any_fills_from_sys.near_cache", 0x44, 0x4, + STXT("Any data cache fills from cache of another CCX when the address was in" + "the same NUMA node")) }, + { I("ls_any_fills_from_sys.remote_cache", 0x44, 0x14, + STXT("Any data cache fills from cache of another CCX when the address was in" + "the same or a different NUMA node")) }, + { I("ls_dmnd_fills_from_sys.all", 0x43, 0xff, + STXT("Demand data cache fills from all types of data sources")) }, + { I("ls_dmnd_fills_from_sys.alternate_memories", 0x43, 0x80, + STXT("Demand data cache fills from extension memory")) }, + { I("ls_dmnd_fills_from_sys.dram_io_far", 0x43, 0x40, + STXT("Demand data cache fills from either DRAM or MMIO in a different NUMA" + "node (same or different socket)")) }, + { I("ls_dmnd_fills_from_sys.dram_io_near", 0x43, 0x8, + STXT("Demand data cache fills from either DRAM or MMIO in the same NUMA node")) }, + { I("ls_dmnd_fills_from_sys.far_cache", 0x43, 0x10, + STXT("Demand data cache fills from cache of another CCX when the address was" + "in a different NUMA node")) }, + { I("ls_dmnd_fills_from_sys.local_ccx", 0x43, 0x2, + STXT("Demand data cache fills from L3 cache or different L2 cache in the" + "same CCX")) }, + { I("ls_dmnd_fills_from_sys.local_l2", 0x43, 0x1, + STXT("Demand 
data cache fills from local L2 cache")) }, + { I("ls_dmnd_fills_from_sys.near_cache", 0x43, 0x4, + STXT("Demand data cache fills from cache of another CCX when the address was" + "in the same NUMA node")) }, + { I("ls_hw_pf_dc_fills.all", 0x5a, 0xdf, + STXT("Hardware prefetch data cache fills from all types of data sources")) }, + { I("ls_hw_pf_dc_fills.alternate_memories", 0x5a, 0x80, + STXT("Hardware prefetch data cache fills from extension memory")) }, + { I("ls_hw_pf_dc_fills.dram_io_far", 0x5a, 0x40, + STXT("Hardware prefetch data cache fills from either DRAM or MMIO in a" + "different NUMA node (same or different socket)")) }, + { I("ls_hw_pf_dc_fills.dram_io_near", 0x5a, 0x8, + STXT("Hardware prefetch data cache fills from either DRAM or MMIO in the" + "same NUMA node")) }, + { I("ls_hw_pf_dc_fills.far_cache", 0x5a, 0x10, + STXT("Hardware prefetch data cache fills from cache of another CCX when the" + "address was in a different NUMA node")) }, + { I("ls_hw_pf_dc_fills.local_ccx", 0x5a, 0x2, + STXT("Hardware prefetch data cache fills from L3 cache or different L2 cache" + "in the same CCX")) }, + { I("ls_hw_pf_dc_fills.local_l2", 0x5a, 0x1, + STXT("Hardware prefetch data cache fills from local L2 cache")) }, + { I("ls_hw_pf_dc_fills.near_cache", 0x5a, 0x4, + STXT("Hardware prefetch data cache fills from cache of another CCX when the" + "address was in the same NUMA node")) }, + { I("ls_inef_sw_pref.all", 0x52, 0x3, STXT("(null)")) }, + { I("ls_inef_sw_pref.data_pipe_sw_pf_dc_hit", 0x52, 0x1, + STXT("Software prefetches that did not fetch data outside of the processor" + "core as the PREFETCH instruction saw a data cache hit")) }, + { I("ls_inef_sw_pref.mab_mch_cnt", 0x52, 0x2, + STXT("Software prefetches that did not fetch data outside of the processor" + "core as the PREFETCH instruction saw a match on an already allocated" + "Miss Address Buffer (MAB)")) }, + { I("ls_mab_alloc.all_allocations", 0x41, 0x7f, + STXT("Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe" + "for all types of allocations")) }, + { I("ls_mab_alloc.hardware_prefetcher_allocations", 0x41, 0x40, + STXT("Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe" + "for hardware prefetcher allocations")) }, + { I("ls_mab_alloc.load_store_allocations", 0x41, 0x3f, + STXT("Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe" + "for load-store allocations")) }, + { I("ls_pref_instr_disp.all", 0x4b, 0x7, + STXT("Software prefetch instructions dispatched (speculative) of all types")) }, + { I("ls_pref_instr_disp.prefetch", 0x4b, 0x1, + STXT("Software prefetch instructions dispatched (speculative) of type" + "PrefetchT0 (move data to all cache levels), T1 (move data to all cache" + "levels except L1) and T2 (move data to all cache levels except L1 and" + "L2)")) }, + { I("ls_pref_instr_disp.prefetch_nta", 0x4b, 0x4, + STXT("Software prefetch instructions dispatched (speculative) of type" + "PrefetchNTA (move data with minimum cache pollution i.e. 
non-temporal" + "access)")) }, + { I("ls_pref_instr_disp.prefetch_w", 0x4b, 0x2, + STXT("Software prefetch instructions dispatched (speculative) of type" + "PrefetchW (move data to L1 cache and mark it modifiable)")) }, + { I("ls_sw_pf_dc_fills.all", 0x59, 0xdf, + STXT("Software prefetch data cache fills from all types of data sources")) }, + { I("ls_sw_pf_dc_fills.alternate_memories", 0x59, 0x80, + STXT("Software prefetch data cache fills from extension memory")) }, + { I("ls_sw_pf_dc_fills.dram_io_far", 0x59, 0x40, + STXT("Software prefetch data cache fills from either DRAM or MMIO in a" + "different NUMA node (same or different socket)")) }, + { I("ls_sw_pf_dc_fills.dram_io_near", 0x59, 0x8, + STXT("Software prefetch data cache fills from either DRAM or MMIO in the" + "same NUMA node")) }, + { I("ls_sw_pf_dc_fills.far_cache", 0x59, 0x10, + STXT("Software prefetch data cache fills from cache of another CCX in a" + "different NUMA node")) }, + { I("ls_sw_pf_dc_fills.local_ccx", 0x59, 0x2, + STXT("Software prefetch data cache fills from L3 cache or different L2 cache" + "in the same CCX")) }, + { I("ls_sw_pf_dc_fills.local_l2", 0x59, 0x1, + STXT("Software prefetch data cache fills from local L2 cache")) }, + { I("ls_sw_pf_dc_fills.near_cache", 0x59, 0x4, + STXT("Software prefetch data cache fills from cache of another CCX in the" + "same NUMA node")) }, + { I("op_cache_hit_miss.all_op_cache_accesses", 0x28f, 0x7, + STXT("Op cache accesses of all types")) }, + { I("op_cache_hit_miss.op_cache_hit", 0x28f, 0x3, STXT("Op cache hits")) }, + { I("op_cache_hit_miss.op_cache_miss", 0x28f, 0x4, + STXT("Op cache misses")) }, +/* core: */ + { I("ex_div_busy", 0xd3, 0, STXT("Number of cycles the divider is busy")) }, + { I("ex_div_count", 0xd4, 0, STXT("Divide ops executed")) }, + { I("ex_no_retire.all", 0xd6, 0x1b, + STXT("Cycles with no retire for any reason")) }, + { I("ex_no_retire.empty", 0xd6, 0x1, + STXT("Cycles with no retire due to the lack of valid ops in the retire queue" + "(may be caused by front-end bottlenecks or pipeline redirects)")) }, + { I("ex_no_retire.load_not_complete", 0xd6, 0xa2, + STXT("Cycles with no retire while the oldest op is waiting for load data")) }, + { I("ex_no_retire.not_complete", 0xd6, 0x2, + STXT("Cycles with no retire while the oldest op is waiting to be executed")) }, + { I("ex_no_retire.other", 0xd6, 0x8, + STXT("Cycles with no retire caused by other reasons (retire breaks, traps," + "faults, etc.)")) }, + { I("ex_no_retire.thread_not_selected", 0xd6, 0x10, + STXT("Cycles with no retire because thread arbitration did not select the" + "thread")) }, + { I("ex_ret_fused_instr", 0x1d0, 0, STXT("Retired fused instructions")) }, + { I("ex_ret_instr", 0xc0, 0, STXT("Retired instructions")) }, + { I("ex_ret_ops", 0xc1, 0, STXT("Retired macro-ops")) }, + { I("ex_ret_ucode_instr", 0x1c1, 0, + STXT("Retired microcoded instructions")) }, + { I("ex_ret_ucode_ops", 0x1c2, 0, STXT("Retired microcode ops")) }, + { I("ex_tagged_ibs_ops.ibs_tagged_ops", 0x1cf, 0x1, + STXT("Ops tagged by IBS")) }, + { I("ex_tagged_ibs_ops.ibs_tagged_ops_ret", 0x1cf, 0x2, + STXT("Ops tagged by IBS that retired")) }, + { I("ls_int_taken", 0x2c, 0, STXT("Interrupts taken")) }, + { I("ls_locks.bus_lock", 0x25, 0x1, + STXT("Retired Lock instructions which caused a bus lock")) }, + { I("ls_not_halted_cyc", 0x76, 0, STXT("Core cycles not in halt")) }, + { I("ls_not_halted_p0_cyc.p0_freq_cyc", 0x120, 0x1, + STXT("Reference cycles (P0 frequency) not in halt")) }, + { I("ls_ret_cl_flush", 0x26, 0, 
STXT("Retired CLFLUSH instructions")) }, + { I("ls_ret_cpuid", 0x27, 0, STXT("Retired CPUID instructions")) }, + { I("ls_smi_rx", 0x2b, 0, STXT("SMIs received")) }, +/* floating point: */ + { I("fp_disp_faults.all", 0xe, 0xf, + STXT("Floating-point dispatch faults of all types")) }, + { I("fp_disp_faults.sse_avx_all", 0xe, 0xe, + STXT("Floating-point dispatch faults of all types for SSE and AVX ops")) }, + { I("fp_disp_faults.x87_fill_fault", 0xe, 0x1, + STXT("Floating-point dispatch faults for x87 fills")) }, + { I("fp_disp_faults.xmm_fill_fault", 0xe, 0x2, + STXT("Floating-point dispatch faults for XMM fills")) }, + { I("fp_disp_faults.ymm_fill_fault", 0xe, 0x4, + STXT("Floating-point dispatch faults for YMM fills")) }, + { I("fp_disp_faults.ymm_spill_fault", 0xe, 0x8, + STXT("Floating-point dispatch faults for YMM spills")) }, + { I("fp_ops_retired_by_type.all", 0xa, 0xff, + STXT("Retired floating-point ops of all types")) }, + { I("fp_ops_retired_by_type.scalar_add", 0xa, 0x1, + STXT("Retired scalar floating-point add ops")) }, + { I("fp_ops_retired_by_type.scalar_all", 0xa, 0xf, + STXT("Retired scalar floating-point ops of all types")) }, + { I("fp_ops_retired_by_type.scalar_blend", 0xa, 0x9, + STXT("Retired scalar floating-point blend ops")) }, + { I("fp_ops_retired_by_type.scalar_cmp", 0xa, 0x7, + STXT("Retired scalar floating-point compare ops")) }, + { I("fp_ops_retired_by_type.scalar_cvt", 0xa, 0x8, + STXT("Retired scalar floating-point convert ops")) }, + { I("fp_ops_retired_by_type.scalar_div", 0xa, 0x5, + STXT("Retired scalar floating-point divide ops")) }, + { I("fp_ops_retired_by_type.scalar_mac", 0xa, 0x4, + STXT("Retired scalar floating-point multiply-accumulate ops")) }, + { I("fp_ops_retired_by_type.scalar_mul", 0xa, 0x3, + STXT("Retired scalar floating-point multiply ops")) }, + { I("fp_ops_retired_by_type.scalar_other", 0xa, 0xe, + STXT("Retired scalar floating-point ops of other types")) }, + { I("fp_ops_retired_by_type.scalar_sqrt", 0xa, 0x6, + STXT("Retired scalar floating-point square root ops")) }, + { I("fp_ops_retired_by_type.scalar_sub", 0xa, 0x2, + STXT("Retired scalar floating-point subtract ops")) }, + { I("fp_ops_retired_by_type.vector_add", 0xa, 0x10, + STXT("Retired vector floating-point add ops")) }, + { I("fp_ops_retired_by_type.vector_all", 0xa, 0xf0, + STXT("Retired vector floating-point ops of all types")) }, + { I("fp_ops_retired_by_type.vector_blend", 0xa, 0x90, + STXT("Retired vector floating-point blend ops")) }, + { I("fp_ops_retired_by_type.vector_cmp", 0xa, 0x70, + STXT("Retired vector floating-point compare ops")) }, + { I("fp_ops_retired_by_type.vector_cvt", 0xa, 0x80, + STXT("Retired vector floating-point convert ops")) }, + { I("fp_ops_retired_by_type.vector_div", 0xa, 0x50, + STXT("Retired vector floating-point divide ops")) }, + { I("fp_ops_retired_by_type.vector_logical", 0xa, 0xd0, + STXT("Retired vector floating-point logical ops")) }, + { I("fp_ops_retired_by_type.vector_mac", 0xa, 0x40, + STXT("Retired vector floating-point multiply-accumulate ops")) }, + { I("fp_ops_retired_by_type.vector_mul", 0xa, 0x30, + STXT("Retired vector floating-point multiply ops")) }, + { I("fp_ops_retired_by_type.vector_other", 0xa, 0xe0, + STXT("Retired vector floating-point ops of other types")) }, + { I("fp_ops_retired_by_type.vector_shuffle", 0xa, 0xb0, + STXT("Retired vector floating-point shuffle ops (may include instructions" + "not necessarily thought of as including shuffles e.g. 
horizontal add," + "dot product, and certain MOV instructions)")) }, + { I("fp_ops_retired_by_type.vector_sqrt", 0xa, 0x60, + STXT("Retired vector floating-point square root ops")) }, + { I("fp_ops_retired_by_type.vector_sub", 0xa, 0x20, + STXT("Retired vector floating-point subtract ops")) }, + { I("fp_ops_retired_by_width.all", 0x8, 0x3f, + STXT("Retired floating-point ops of all widths")) }, + { I("fp_ops_retired_by_width.mmx_uops_retired", 0x8, 0x2, + STXT("Retired MMX floating-point ops")) }, + { I("fp_ops_retired_by_width.pack_128_uops_retired", 0x8, 0x8, + STXT("Retired packed 128-bit floating-point ops")) }, + { I("fp_ops_retired_by_width.pack_256_uops_retired", 0x8, 0x10, + STXT("Retired packed 256-bit floating-point ops")) }, + { I("fp_ops_retired_by_width.pack_512_uops_retired", 0x8, 0x20, + STXT("Retired packed 512-bit floating-point ops")) }, + { I("fp_ops_retired_by_width.scalar_uops_retired", 0x8, 0x4, + STXT("Retired scalar floating-point ops")) }, + { I("fp_ops_retired_by_width.x87_uops_retired", 0x8, 0x1, + STXT("Retired x87 floating-point ops")) }, + { I("fp_pack_ops_retired.all", 0xc, 0xff, + STXT("Retired packed floating-point ops of all types")) }, + { I("fp_pack_ops_retired.fp128_add", 0xc, 0x1, + STXT("Retired 128-bit packed floating-point add ops")) }, + { I("fp_pack_ops_retired.fp128_all", 0xc, 0xf, + STXT("Retired 128-bit packed floating-point ops of all types")) }, + { I("fp_pack_ops_retired.fp128_blend", 0xc, 0x9, + STXT("Retired 128-bit packed floating-point blend ops")) }, + { I("fp_pack_ops_retired.fp128_cmp", 0xc, 0x7, + STXT("Retired 128-bit packed floating-point compare ops")) }, + { I("fp_pack_ops_retired.fp128_cvt", 0xc, 0x8, + STXT("Retired 128-bit packed floating-point convert ops")) }, + { I("fp_pack_ops_retired.fp128_div", 0xc, 0x5, + STXT("Retired 128-bit packed floating-point divide ops")) }, + { I("fp_pack_ops_retired.fp128_logical", 0xc, 0xd, + STXT("Retired 128-bit packed floating-point logical ops")) }, + { I("fp_pack_ops_retired.fp128_mac", 0xc, 0x4, + STXT("Retired 128-bit packed floating-point multiply-accumulate ops")) }, + { I("fp_pack_ops_retired.fp128_mul", 0xc, 0x3, + STXT("Retired 128-bit packed floating-point multiply ops")) }, + { I("fp_pack_ops_retired.fp128_other", 0xc, 0xe, + STXT("Retired 128-bit packed floating-point ops of other types")) }, + { I("fp_pack_ops_retired.fp128_shuffle", 0xc, 0xb, + STXT("Retired 128-bit packed floating-point shuffle ops (may include" + "instructions not necessarily thought of as including shuffles e.g." 
+ "horizontal add, dot product, and certain MOV instructions)")) }, + { I("fp_pack_ops_retired.fp128_sqrt", 0xc, 0x6, + STXT("Retired 128-bit packed floating-point square root ops")) }, + { I("fp_pack_ops_retired.fp128_sub", 0xc, 0x2, + STXT("Retired 128-bit packed floating-point subtract ops")) }, + { I("fp_pack_ops_retired.fp256_add", 0xc, 0x10, + STXT("Retired 256-bit packed floating-point add ops")) }, + { I("fp_pack_ops_retired.fp256_all", 0xc, 0xf0, + STXT("Retired 256-bit packed floating-point ops of all types")) }, + { I("fp_pack_ops_retired.fp256_blend", 0xc, 0x90, + STXT("Retired 256-bit packed floating-point blend ops")) }, + { I("fp_pack_ops_retired.fp256_cmp", 0xc, 0x70, + STXT("Retired 256-bit packed floating-point compare ops")) }, + { I("fp_pack_ops_retired.fp256_cvt", 0xc, 0x80, + STXT("Retired 256-bit packed floating-point convert ops")) }, + { I("fp_pack_ops_retired.fp256_div", 0xc, 0x50, + STXT("Retired 256-bit packed floating-point divide ops")) }, + { I("fp_pack_ops_retired.fp256_logical", 0xc, 0xd0, + STXT("Retired 256-bit packed floating-point logical ops")) }, + { I("fp_pack_ops_retired.fp256_mac", 0xc, 0x40, + STXT("Retired 256-bit packed floating-point multiply-accumulate ops")) }, + { I("fp_pack_ops_retired.fp256_mul", 0xc, 0x30, + STXT("Retired 256-bit packed floating-point multiply ops")) }, + { I("fp_pack_ops_retired.fp256_other", 0xc, 0xe0, + STXT("Retired 256-bit packed floating-point ops of other types")) }, + { I("fp_pack_ops_retired.fp256_shuffle", 0xc, 0xb0, + STXT("Retired 256-bit packed floating-point shuffle ops (may include" + "instructions not necessarily thought of as including shuffles e.g." + "horizontal add, dot product, and certain MOV instructions)")) }, + { I("fp_pack_ops_retired.fp256_sqrt", 0xc, 0x60, + STXT("Retired 256-bit packed floating-point square root ops")) }, + { I("fp_pack_ops_retired.fp256_sub", 0xc, 0x20, + STXT("Retired 256-bit packed floating-point subtract ops")) }, + { I("fp_ret_sse_avx_ops.add_sub_flops", 0x3, 0x1, + STXT("Retired SSE and AVX floating-point add and subtract ops")) }, + { I("fp_ret_sse_avx_ops.all", 0x3, 0x1f, + STXT("Retired SSE and AVX floating-point ops of all types")) }, + { I("fp_ret_sse_avx_ops.bfloat_mac_flops", 0x3, 0x10, + STXT("Retired SSE and AVX floating-point bfloat multiply-accumulate ops" + "(each operation is counted as 2 ops)")) }, + { I("fp_ret_sse_avx_ops.div_flops", 0x3, 0x4, + STXT("Retired SSE and AVX floating-point divide and square root ops")) }, + { I("fp_ret_sse_avx_ops.mac_flops", 0x3, 0x8, + STXT("Retired SSE and AVX floating-point multiply-accumulate ops (each" + "operation is counted as 2 ops)")) }, + { I("fp_ret_sse_avx_ops.mult_flops", 0x3, 0x2, + STXT("Retired SSE and AVX floating-point multiply ops")) }, + { I("fp_ret_x87_fp_ops.add_sub_ops", 0x2, 0x1, + STXT("Retired x87 floating-point add and subtract ops")) }, + { I("fp_ret_x87_fp_ops.all", 0x2, 0x7, + STXT("Retired x87 floating-point ops of all types")) }, + { I("fp_ret_x87_fp_ops.div_sqrt_ops", 0x2, 0x4, + STXT("Retired x87 floating-point divide and square root ops")) }, + { I("fp_ret_x87_fp_ops.mul_ops", 0x2, 0x2, + STXT("Retired x87 floating-point multiply ops")) }, + { I("fp_retired_ser_ops.all", 0x5, 0xf, + STXT("Retired SSE and AVX serializing ops of all types")) }, + { I("fp_retired_ser_ops.sse_bot_ret", 0x5, 0x8, + STXT("Retired SSE and AVX bottom-executing ops. 
Bottom-executing ops wait" + "for all older ops to retire before executing")) }, + { I("fp_retired_ser_ops.sse_ctrl_ret", 0x5, 0x4, + STXT("Retired SSE and AVX control word mispredict traps")) }, + { I("fp_retired_ser_ops.x87_bot_ret", 0x5, 0x2, + STXT("Retired x87 bottom-executing ops. Bottom-executing ops wait for all" + "older ops to retire before executing")) }, + { I("fp_retired_ser_ops.x87_ctrl_ret", 0x5, 0x1, + STXT("Retired x87 control word mispredict traps due to mispredictions in RC" + "or PC, or changes in exception mask bits")) }, + { I("packed_int_op_type.all", 0xd, 0xff, + STXT("Retired packed integer ops of all types")) }, + { I("packed_int_op_type.int128_add", 0xd, 0x1, + STXT("Retired 128-bit packed integer add ops")) }, + { I("packed_int_op_type.int128_aes", 0xd, 0x5, + STXT("Retired 128-bit packed integer AES ops")) }, + { I("packed_int_op_type.int128_all", 0xd, 0xf, + STXT("Retired 128-bit packed integer ops of all types")) }, + { I("packed_int_op_type.int128_clm", 0xd, 0x8, + STXT("Retired 128-bit packed integer CLM ops")) }, + { I("packed_int_op_type.int128_cmp", 0xd, 0x7, + STXT("Retired 128-bit packed integer compare ops")) }, + { I("packed_int_op_type.int128_logical", 0xd, 0xd, + STXT("Retired 128-bit packed integer logical ops")) }, + { I("packed_int_op_type.int128_mac", 0xd, 0x4, + STXT("Retired 128-bit packed integer multiply-accumulate ops")) }, + { I("packed_int_op_type.int128_mov", 0xd, 0xa, + STXT("Retired 128-bit packed integer MOV ops")) }, + { I("packed_int_op_type.int128_mul", 0xd, 0x3, + STXT("Retired 128-bit packed integer multiply ops")) }, + { I("packed_int_op_type.int128_other", 0xd, 0xe, + STXT("Retired 128-bit packed integer ops of other types")) }, + { I("packed_int_op_type.int128_pack", 0xd, 0xc, + STXT("Retired 128-bit packed integer pack ops")) }, + { I("packed_int_op_type.int128_sha", 0xd, 0x6, + STXT("Retired 128-bit packed integer SHA ops")) }, + { I("packed_int_op_type.int128_shift", 0xd, 0x9, + STXT("Retired 128-bit packed integer shift ops")) }, + { I("packed_int_op_type.int128_shuffle", 0xd, 0xb, + STXT("Retired 128-bit packed integer shuffle ops (may include instructions" + "not necessarily thought of as including shuffles e.g. 
horizontal add," + "dot product, and certain MOV instructions)")) }, + { I("packed_int_op_type.int128_sub", 0xd, 0x2, + STXT("Retired 128-bit packed integer subtract ops")) }, + { I("packed_int_op_type.int256_add", 0xd, 0x10, + STXT("Retired 256-bit packed integer add ops")) }, + { I("packed_int_op_type.int256_all", 0xd, 0xf0, + STXT("Retired 256-bit packed integer ops of all types")) }, + { I("packed_int_op_type.int256_cmp", 0xd, 0x70, + STXT("Retired 256-bit packed integer compare ops")) }, + { I("packed_int_op_type.int256_logical", 0xd, 0xd0, + STXT("Retired 256-bit packed integer logical ops")) }, + { I("packed_int_op_type.int256_mac", 0xd, 0x40, + STXT("Retired 256-bit packed integer multiply-accumulate ops")) }, + { I("packed_int_op_type.int256_mov", 0xd, 0xa0, + STXT("Retired 256-bit packed integer MOV ops")) }, + { I("packed_int_op_type.int256_mul", 0xd, 0x30, + STXT("Retired 256-bit packed integer multiply ops")) }, + { I("packed_int_op_type.int256_other", 0xd, 0xe0, + STXT("Retired 256-bit packed integer ops of other types")) }, + { I("packed_int_op_type.int256_pack", 0xd, 0xc0, + STXT("Retired 256-bit packed integer pack ops")) }, + { I("packed_int_op_type.int256_shift", 0xd, 0x90, + STXT("Retired 256-bit packed integer shift ops")) }, + { I("packed_int_op_type.int256_shuffle", 0xd, 0xb0, + STXT("Retired 256-bit packed integer shuffle ops (may include instructions" + "not necessarily thought of as including shuffles e.g. horizontal add," + "dot product, and certain MOV instructions)")) }, + { I("packed_int_op_type.int256_sub", 0xd, 0x20, + STXT("Retired 256-bit packed integer subtract ops")) }, + { I("sse_avx_ops_retired.all", 0xb, 0xff, + STXT("Retired SSE, AVX and MMX integer ops of all types")) }, + { I("sse_avx_ops_retired.mmx_add", 0xb, 0x1, + STXT("Retired MMX integer add")) }, + { I("sse_avx_ops_retired.mmx_all", 0xb, 0xf, + STXT("Retired MMX integer ops of all types")) }, + { I("sse_avx_ops_retired.mmx_cmp", 0xb, 0x7, + STXT("Retired MMX integer compare ops")) }, + { I("sse_avx_ops_retired.mmx_logical", 0xb, 0xd, + STXT("Retired MMX integer logical ops")) }, + { I("sse_avx_ops_retired.mmx_mac", 0xb, 0x4, + STXT("Retired MMX integer multiply-accumulate ops")) }, + { I("sse_avx_ops_retired.mmx_mov", 0xb, 0xa, + STXT("Retired MMX integer MOV ops")) }, + { I("sse_avx_ops_retired.mmx_mul", 0xb, 0x3, + STXT("Retired MMX integer multiply ops")) }, + { I("sse_avx_ops_retired.mmx_other", 0xb, 0xe, + STXT("Retired MMX integer multiply ops of other types")) }, + { I("sse_avx_ops_retired.mmx_pack", 0xb, 0xc, + STXT("Retired MMX integer pack ops")) }, + { I("sse_avx_ops_retired.mmx_shift", 0xb, 0x9, + STXT("Retired MMX integer shift ops")) }, + { I("sse_avx_ops_retired.mmx_shuffle", 0xb, 0xb, + STXT("Retired MMX integer shuffle ops (may include instructions not" + "necessarily thought of as including shuffles e.g. 
horizontal add, dot" + "product, and certain MOV instructions)")) }, + { I("sse_avx_ops_retired.mmx_sub", 0xb, 0x2, + STXT("Retired MMX integer subtract ops")) }, + { I("sse_avx_ops_retired.sse_avx_add", 0xb, 0x10, + STXT("Retired SSE and AVX integer add ops")) }, + { I("sse_avx_ops_retired.sse_avx_aes", 0xb, 0x50, + STXT("Retired SSE and AVX integer AES ops")) }, + { I("sse_avx_ops_retired.sse_avx_all", 0xb, 0xf0, + STXT("Retired SSE and AVX integer ops of all types")) }, + { I("sse_avx_ops_retired.sse_avx_clm", 0xb, 0x80, + STXT("Retired SSE and AVX integer CLM ops")) }, + { I("sse_avx_ops_retired.sse_avx_cmp", 0xb, 0x70, + STXT("Retired SSE and AVX integer compare ops")) }, + { I("sse_avx_ops_retired.sse_avx_logical", 0xb, 0xd0, + STXT("Retired SSE and AVX integer logical ops")) }, + { I("sse_avx_ops_retired.sse_avx_mac", 0xb, 0x40, + STXT("Retired SSE and AVX integer multiply-accumulate ops")) }, + { I("sse_avx_ops_retired.sse_avx_mov", 0xb, 0xa0, + STXT("Retired SSE and AVX integer MOV ops")) }, + { I("sse_avx_ops_retired.sse_avx_mul", 0xb, 0x30, + STXT("Retired SSE and AVX integer multiply ops")) }, + { I("sse_avx_ops_retired.sse_avx_other", 0xb, 0xe0, + STXT("Retired SSE and AVX integer ops of other types")) }, + { I("sse_avx_ops_retired.sse_avx_pack", 0xb, 0xc0, + STXT("Retired SSE and AVX integer pack ops")) }, + { I("sse_avx_ops_retired.sse_avx_sha", 0xb, 0x60, + STXT("Retired SSE and AVX integer SHA ops")) }, + { I("sse_avx_ops_retired.sse_avx_shift", 0xb, 0x90, + STXT("Retired SSE and AVX integer shift ops")) }, + { I("sse_avx_ops_retired.sse_avx_shuffle", 0xb, 0xb0, + STXT("Retired SSE and AVX integer shuffle ops (may include instructions not" + "necessarily thought of as including shuffles e.g. horizontal add, dot" + "product, and certain MOV instructions)")) }, + { I("sse_avx_ops_retired.sse_avx_sub", 0xb, 0x20, + STXT("Retired SSE and AVX integer subtract ops")) }, +/* memory: */ + { I("bp_l1_tlb_fetch_hit.all", 0x94, 0x7, + STXT("Instruction fetches that hit in the L1 ITLB for all page sizes")) }, + { I("bp_l1_tlb_fetch_hit.if1g", 0x94, 0x4, + STXT("Instruction fetches that hit in the L1 ITLB for 1G pages")) }, + { I("bp_l1_tlb_fetch_hit.if2m", 0x94, 0x2, + STXT("Instruction fetches that hit in the L1 ITLB for 2M pages")) }, + { I("bp_l1_tlb_fetch_hit.if4k", 0x94, 0x1, + STXT("Instruction fetches that hit in the L1 ITLB for 4k or coalesced pages." + "A coalesced page is a 16k page created from four adjacent 4k pages")) }, + { I("bp_l1_tlb_miss_l2_tlb_hit", 0x84, 0, + STXT("Instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB")) }, + { I("bp_l1_tlb_miss_l2_tlb_miss.all", 0x85, 0xf, + STXT("Instruction fetches that miss in both the L1 and L2 ITLBs (page-table" + "walks are requested) for all page sizes")) }, + { I("bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k", 0x85, 0x8, + STXT("Instruction fetches that miss in both the L1 and L2 ITLBs (page-table" + "walks are requested) for coalesced pages. 
A coalesced page is a 16k" + "page created from four adjacent 4k pages")) }, + { I("bp_l1_tlb_miss_l2_tlb_miss.if1g", 0x85, 0x4, + STXT("Instruction fetches that miss in both the L1 and L2 ITLBs (page-table" + "walks are requested) for 1G pages")) }, + { I("bp_l1_tlb_miss_l2_tlb_miss.if2m", 0x85, 0x2, + STXT("Instruction fetches that miss in both the L1 and L2 ITLBs (page-table" + "walks are requested) for 2M pages")) }, + { I("bp_l1_tlb_miss_l2_tlb_miss.if4k", 0x85, 0x1, + STXT("Instruction fetches that miss in both the L1 and L2 ITLBs (page-table" + "walks are requested) for 4k pages")) }, + { I("ls_bad_status2.stli_other", 0x24, 0x2, + STXT("Store-to-load conflicts (load unable to complete due to a" + "non-forwardable conflict with an older store)")) }, + { I("ls_dispatch.ld_dispatch", 0x29, 0x1, + STXT("Number of memory load operations dispatched to the load-store unit")) }, + { I("ls_dispatch.ld_st_dispatch", 0x29, 0x4, + STXT("Number of memory load-store operations dispatched to the load-store" + "unit")) }, + { I("ls_dispatch.store_dispatch", 0x29, 0x2, + STXT("Number of memory store operations dispatched to the load-store unit")) }, + { I("ls_l1_d_tlb_miss.all", 0x45, 0xff, + STXT("L1 DTLB misses for all page sizes")) }, + { I("ls_l1_d_tlb_miss.all_l2_miss", 0x45, 0xf0, + STXT("L1 DTLB misses with L2 DTLB misses (page-table walks are requested)" + "for all page sizes")) }, + { I("ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit", 0x45, 0x8, + STXT("L1 DTLB misses with L2 DTLB hits for 1G pages")) }, + { I("ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss", 0x45, 0x80, + STXT("L1 DTLB misses with L2 DTLB misses (page-table walks are requested)" + "for 1G pages")) }, + { I("ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit", 0x45, 0x4, + STXT("L1 DTLB misses with L2 DTLB hits for 2M pages")) }, + { I("ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss", 0x45, 0x40, + STXT("L1 DTLB misses with L2 DTLB misses (page-table walks are requested)" + "for 2M pages")) }, + { I("ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit", 0x45, 0x1, + STXT("L1 DTLB misses with L2 DTLB hits for 4k pages")) }, + { I("ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss", 0x45, 0x10, + STXT("L1 DTLB misses with L2 DTLB misses (page-table walks are requested)" + "for 4k pages")) }, + { I("ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit", 0x45, 0x2, + STXT("L1 DTLB misses with L2 DTLB hits for coalesced pages. A coalesced page" + "is a 16k page created from four adjacent 4k pages")) }, + { I("ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss", 0x45, 0x20, + STXT("L1 DTLB misses with L2 DTLB misses (page-table walks are requested)" + "for coalesced pages. 
A coalesced page is a 16k page created from four" + "adjacent 4k pages")) }, + { I("ls_misal_loads.ma4k", 0x47, 0x2, + STXT("4kB misaligned (page crossing) loads")) }, + { I("ls_misal_loads.ma64", 0x47, 0x1, + STXT("64B misaligned (cacheline crossing) loads")) }, + { I("ls_st_commit_cancel2.st_commit_cancel_wcb_full", 0x37, 0x1, + STXT("Non-cacheable store commits cancelled due to the non-cacheable commit" + "buffer being full")) }, + { I("ls_stlf", 0x35, 0, STXT("Store-to-load-forward (STLF) hits")) }, + { I("ls_tlb_flush.all", 0x78, 0xff, STXT("All TLB Flushes")) }, +/* other: */ + { I("de_dis_dispatch_token_stalls1.fp_flush_recovery_stall", 0xae, 0x80, + STXT("Number of cycles dispatch is stalled for floating-point flush recovery")) }, + { I("de_dis_dispatch_token_stalls1.fp_reg_file_rsrc_stall", 0xae, 0x20, + STXT("Number of cycles dispatch is stalled for floating-point register file" + "tokens")) }, + { I("de_dis_dispatch_token_stalls1.fp_sch_rsrc_stall", 0xae, 0x40, + STXT("Number of cycles dispatch is stalled for floating-point scheduler" + "tokens")) }, + { I("de_dis_dispatch_token_stalls1.int_phy_reg_file_rsrc_stall", 0xae, 0x1, + STXT("Number of cycles dispatch is stalled for integer physical register" + "file tokens")) }, + { I("de_dis_dispatch_token_stalls1.load_queue_rsrc_stall", 0xae, 0x2, + STXT("Number of cycles dispatch is stalled for Load queue token")) }, + { I("de_dis_dispatch_token_stalls1.store_queue_rsrc_stall", 0xae, 0x4, + STXT("Number of cycles dispatch is stalled for store queue tokens")) }, + { I("de_dis_dispatch_token_stalls1.taken_brnch_buffer_rsrc", 0xae, 0x10, + STXT("Number of cycles dispatch is stalled for taken branch buffer tokens")) }, + { I("de_dis_dispatch_token_stalls2.int_sch0_token_stall", 0xaf, 0x1, + STXT("Number of cycles dispatch is stalled for integer scheduler queue 0" + "tokens")) }, + { I("de_dis_dispatch_token_stalls2.int_sch1_token_stall", 0xaf, 0x2, + STXT("Number of cycles dispatch is stalled for integer scheduler queue 1" + "tokens")) }, + { I("de_dis_dispatch_token_stalls2.int_sch2_token_stall", 0xaf, 0x4, + STXT("Number of cycles dispatch is stalled for integer scheduler queue 2" + "tokens")) }, + { I("de_dis_dispatch_token_stalls2.int_sch3_token_stall", 0xaf, 0x8, + STXT("Number of cycles dispatch is stalled for integer scheduler queue 3" + "tokens")) }, + { I("de_dis_dispatch_token_stalls2.retire_token_stall", 0xaf, 0x20, + STXT("Number of cycles dispatch is stalled for retire queue tokens")) }, + { I("de_dis_ops_from_decoder.any_fp_dispatch", 0xab, 0x4, + STXT("Number of ops dispatched to the floating-point unit")) }, + { I("de_dis_ops_from_decoder.disp_op_type.any_integer_dispatch", 0xab, 0x8, + STXT("Number of ops dispatched to the integer execution unit")) }, + { I("de_no_dispatch_per_slot.backend_stalls", 0x1a0, 0x1e, + STXT("In each cycle counts ops unable to dispatch because of back-end stalls")) }, + { I("de_no_dispatch_per_slot.no_ops_from_frontend", 0x1a0, 0x1, + STXT("In each cycle counts dispatch slots left empty because the front-end" + "did not supply ops")) }, + { I("de_no_dispatch_per_slot.smt_contention", 0x1a0, 0x60, + STXT("In each cycle counts ops unable to dispatch because the dispatch cycle" + "was granted to the other SMT thread")) }, + { I("de_op_queue_empty", 0xa9, 0, + STXT("Cycles when the op queue is empty. 
Such cycles indicate that the" + "front-end is not delivering instructions fast enough")) }, + { I("de_src_op_disp.all", 0xaa, 0x7, + STXT("Ops dispatched from any source")) }, + { I("de_src_op_disp.decoder", 0xaa, 0x1, + STXT("Ops fetched from instruction cache and dispatched")) }, + { I("de_src_op_disp.loop_buffer", 0xaa, 0x4, + STXT("Ops dispatched from loop buffer")) }, + { I("de_src_op_disp.op_cache", 0xaa, 0x2, + STXT("Ops fetched from op cache and dispatched")) }, + { I("resyncs_or_nc_redirects", 0x96, 0, + STXT("Pipeline restarts not caused by branch mispredicts")) }, +/* recommended: */ + { I("all_data_cache_accesses", 0x29, 0x7, STXT("All data cache accesses")) }, + { NULL, NULL, 0, NULL } +}; + +#undef I +#endif + -- cgit v1.1