author     Peter Maydell <peter.maydell@linaro.org>    2020-10-09 15:48:04 +0100
committer  Peter Maydell <peter.maydell@linaro.org>    2020-10-09 15:48:04 +0100
commit     4a7c0bd9dcb08798c6f82e55b5a3423f7ee669f1
tree       c13ace9bf9ea4811c3e97cdc381bb7ee0956eacb    /hw/ppc/spapr_numa.c
parent     e1c30c43cd0bcb5c7a0877c7aa9ddc8f4a99afbc
parent     307e7a34dc474c050f345eeb519d957a42f10c77
Merge remote-tracking branch 'remotes/dgibson/tags/ppc-for-5.2-20201009' into staging
ppc patch queue 2020-10-09
Here's the next set of ppc related patches for qemu-5.2. There are
two main things here:
* Cleanups to error handling in spapr from Greg Kurz
* Improvements to NUMA handling for spapr from Daniel Barboza
There are also a handful of other bugfixes.
# gpg: Signature made Fri 09 Oct 2020 07:02:29 BST
# gpg: using RSA key 75F46586AE61A66CC44E87DC6C38CACA20D9B392
# gpg: Good signature from "David Gibson <david@gibson.dropbear.id.au>" [full]
# gpg: aka "David Gibson (Red Hat) <dgibson@redhat.com>" [full]
# gpg: aka "David Gibson (ozlabs.org) <dgibson@ozlabs.org>" [full]
# gpg: aka "David Gibson (kernel.org) <dwg@kernel.org>" [unknown]
# Primary key fingerprint: 75F4 6586 AE61 A66C C44E 87DC 6C38 CACA 20D9 B392
* remotes/dgibson/tags/ppc-for-5.2-20201009:
specs/ppc-spapr-numa: update with new NUMA support
spapr_numa: consider user input when defining associativity
spapr_numa: change reference-points and maxdomain settings
spapr_numa: forbid asymmetrical NUMA setups
spapr: add spapr_machine_using_legacy_numa() helper
ppc/pnv: Increase max firmware size
spapr: Add a return value to spapr_check_pagesize()
spapr: Add a return value to spapr_nvdimm_validate()
spapr: Simplify error handling in spapr_cpu_core_realize()
spapr: Add a return value to spapr_set_vcpu_id()
spapr: Simplify error handling in prop_get_fdt()
spapr: Add a return value to spapr_drc_attach()
spapr: Simplify error handling in spapr_vio_busdev_realize()
spapr: Simplify error handling in do_client_architecture_support()
spapr: Get rid of cas_check_pvr() error reporting
spapr: Simplify error handling in callers of ppc_set_compat()
ppc: Fix return value in cpu_post_load() error path
ppc: Add a return value to ppc_set_compat() and ppc_set_compat_all()
spapr: Fix error leak in spapr_realize_vcpu()
spapr: Handle HPT allocation failure in nested guest
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
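
Several of the spapr commits listed above share one conversion ("Add a return value to ...", "Simplify error handling in ..."): a function that used to report failure only through an Error ** parameter gains a boolean return value, so callers can test the result directly instead of juggling a local Error *. The sketch below is illustrative only and is not code from this series; validate_pagesize_old/new and the minimal Error/error_setg stand-ins are invented here so the example compiles and runs outside the QEMU tree.

/*
 * Illustrative sketch of the error-handling conversion pattern.
 * "Error" and "error_setg" are simplified stand-ins for QEMU's
 * qapi/error.h API; the page-size check itself is made up.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct Error { const char *msg; } Error;

static void error_setg(Error **errp, const char *msg)
{
    if (errp) {
        *errp = malloc(sizeof(**errp));
        (*errp)->msg = msg;
    }
}

/* Before: void return, the caller needs a local Error * just to detect failure */
static void validate_pagesize_old(unsigned long pagesize, Error **errp)
{
    if (pagesize > 64 * 1024) {
        error_setg(errp, "page size too large");
    }
}

/* After: bool return, the error is still propagated through errp */
static bool validate_pagesize_new(unsigned long pagesize, Error **errp)
{
    if (pagesize > 64 * 1024) {
        error_setg(errp, "page size too large");
        return false;
    }
    return true;
}

int main(void)
{
    Error *err = NULL;

    validate_pagesize_old(128 * 1024, &err);          /* must inspect err */
    if (err) {
        printf("old style: %s\n", err->msg);
        free(err);
        err = NULL;
    }

    if (!validate_pagesize_new(128 * 1024, &err)) {   /* test the return value */
        printf("new style: %s\n", err->msg);
        free(err);
    }
    return 0;
}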
Diffstat (limited to 'hw/ppc/spapr_numa.c')
-rw-r--r--   hw/ppc/spapr_numa.c | 185
1 file changed, 177 insertions(+), 8 deletions(-)
diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c
index 64fe567..b50796b 100644
--- a/hw/ppc/spapr_numa.c
+++ b/hw/ppc/spapr_numa.c
@@ -19,12 +19,126 @@
 /* Moved from hw/ppc/spapr_pci_nvlink2.c */
 #define SPAPR_GPU_NUMA_ID (cpu_to_be32(1))
 
+static bool spapr_numa_is_symmetrical(MachineState *ms)
+{
+    int src, dst;
+    int nb_numa_nodes = ms->numa_state->num_nodes;
+    NodeInfo *numa_info = ms->numa_state->nodes;
+
+    for (src = 0; src < nb_numa_nodes; src++) {
+        for (dst = src; dst < nb_numa_nodes; dst++) {
+            if (numa_info[src].distance[dst] !=
+                numa_info[dst].distance[src]) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+/*
+ * This function will translate the user distances into
+ * what the kernel understands as possible values: 10
+ * (local distance), 20, 40, 80 and 160, and return the equivalent
+ * NUMA level for each. Current heuristic is:
+ *  - local distance (10) returns numa_level = 0x4, meaning there is
+ *    no rounding for local distance
+ *  - distances between 11 and 30 inclusive -> rounded to 20,
+ *    numa_level = 0x3
+ *  - distances between 31 and 60 inclusive -> rounded to 40,
+ *    numa_level = 0x2
+ *  - distances between 61 and 120 inclusive -> rounded to 80,
+ *    numa_level = 0x1
+ *  - everything above 120 returns numa_level = 0 to indicate that
+ *    there is no match. This will be calculated as distance = 160
+ *    by the kernel (as of v5.9)
+ */
+static uint8_t spapr_numa_get_numa_level(uint8_t distance)
+{
+    if (distance == 10) {
+        return 0x4;
+    } else if (distance > 11 && distance <= 30) {
+        return 0x3;
+    } else if (distance > 31 && distance <= 60) {
+        return 0x2;
+    } else if (distance > 61 && distance <= 120) {
+        return 0x1;
+    }
+
+    return 0;
+}
+
+static void spapr_numa_define_associativity_domains(SpaprMachineState *spapr)
+{
+    MachineState *ms = MACHINE(spapr);
+    NodeInfo *numa_info = ms->numa_state->nodes;
+    int nb_numa_nodes = ms->numa_state->num_nodes;
+    int src, dst, i;
+
+    for (src = 0; src < nb_numa_nodes; src++) {
+        for (dst = src; dst < nb_numa_nodes; dst++) {
+            /*
+             * This is how the associativity domain between A and B
+             * is calculated:
+             *
+             * - get the distance D between them
+             * - get the correspondent NUMA level 'n_level' for D
+             * - all associativity arrays were initialized with their own
+             * numa_ids, and we're calculating the distance in node_id
+             * ascending order, starting from node id 0 (the first node
+             * retrieved by numa_state). This will have a cascade effect in
+             * the algorithm because the associativity domains that node 0
+             * defines will be carried over to other nodes, and node 1
+             * associativities will be carried over after taking node 0
+             * associativities into account, and so on. This happens because
+             * we'll assign assoc_src as the associativity domain of dst
+             * as well, for all NUMA levels beyond and including n_level.
+             *
+             * The PPC kernel expects the associativity domains of node 0 to
+             * be always 0, and this algorithm will grant that by default.
+             */
+            uint8_t distance = numa_info[src].distance[dst];
+            uint8_t n_level = spapr_numa_get_numa_level(distance);
+            uint32_t assoc_src;
+
+            /*
+             * n_level = 0 means that the distance is greater than our last
+             * rounded value (120). In this case there is no NUMA level match
+             * between src and dst and we can skip the remaining of the loop.
+             *
+             * The Linux kernel will assume that the distance between src and
+             * dst, in this case of no match, is 10 (local distance) doubled
+             * for each NUMA it didn't match. We have MAX_DISTANCE_REF_POINTS
+             * levels (4), so this gives us 10*2*2*2*2 = 160.
+             *
+             * This logic can be seen in the Linux kernel source code, as of
+             * v5.9, in arch/powerpc/mm/numa.c, function __node_distance().
+             */
+            if (n_level == 0) {
+                continue;
+            }
+
+            /*
+             * We must assign all assoc_src to dst, starting from n_level
+             * and going up to 0x1.
+             */
+            for (i = n_level; i > 0; i--) {
+                assoc_src = spapr->numa_assoc_array[src][i];
+                spapr->numa_assoc_array[dst][i] = assoc_src;
+            }
+        }
+    }
+
+}
+
 void spapr_numa_associativity_init(SpaprMachineState *spapr,
                                    MachineState *machine)
 {
     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
     int nb_numa_nodes = machine->numa_state->num_nodes;
     int i, j, max_nodes_with_gpus;
+    bool using_legacy_numa = spapr_machine_using_legacy_numa(spapr);
 
     /*
      * For all associativity arrays: first position is the size,
@@ -38,6 +152,17 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr,
     for (i = 0; i < nb_numa_nodes; i++) {
         spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);
         spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
+
+        /*
+         * Fill all associativity domains of non-zero NUMA nodes with
+         * node_id. This is required because the default value (0) is
+         * considered a match with associativity domains of node 0.
+         */
+        if (!using_legacy_numa && i != 0) {
+            for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) {
+                spapr->numa_assoc_array[i][j] = cpu_to_be32(i);
+            }
+        }
     }
 
     /*
@@ -61,6 +186,23 @@ void spapr_numa_associativity_init(SpaprMachineState *spapr,
         spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
     }
+
+    /*
+     * Legacy NUMA guests (pseries-5.1 and older, or guests with only
+     * 1 NUMA node) will not benefit from anything we're going to do
+     * after this point.
+     */
+    if (using_legacy_numa) {
+        return;
+    }
+
+    if (!spapr_numa_is_symmetrical(machine)) {
+        error_report("Asymmetrical NUMA topologies aren't supported "
+                     "in the pSeries machine");
+        exit(EXIT_FAILURE);
+    }
+
+    spapr_numa_define_associativity_domains(spapr);
 }
 
 void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt,
@@ -144,24 +286,51 @@ int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt,
  */
 void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas)
 {
+    MachineState *ms = MACHINE(spapr);
     SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
     uint32_t refpoints[] = {
         cpu_to_be32(0x4),
-        cpu_to_be32(0x4),
+        cpu_to_be32(0x3),
         cpu_to_be32(0x2),
+        cpu_to_be32(0x1),
     };
     uint32_t nr_refpoints = ARRAY_SIZE(refpoints);
-    uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 1 : 0);
+    uint32_t maxdomain = ms->numa_state->num_nodes + spapr->gpu_numa_id;
     uint32_t maxdomains[] = {
         cpu_to_be32(4),
-        maxdomain,
-        maxdomain,
-        maxdomain,
-        cpu_to_be32(spapr->gpu_numa_id),
+        cpu_to_be32(maxdomain),
+        cpu_to_be32(maxdomain),
+        cpu_to_be32(maxdomain),
+        cpu_to_be32(maxdomain)
     };
 
-    if (smc->pre_5_1_assoc_refpoints) {
-        nr_refpoints = 2;
+    if (spapr_machine_using_legacy_numa(spapr)) {
+        uint32_t legacy_refpoints[] = {
+            cpu_to_be32(0x4),
+            cpu_to_be32(0x4),
+            cpu_to_be32(0x2),
+        };
+        uint32_t legacy_maxdomain = spapr->gpu_numa_id > 1 ? 1 : 0;
+        uint32_t legacy_maxdomains[] = {
+            cpu_to_be32(4),
+            cpu_to_be32(legacy_maxdomain),
+            cpu_to_be32(legacy_maxdomain),
+            cpu_to_be32(legacy_maxdomain),
+            cpu_to_be32(spapr->gpu_numa_id),
+        };
+
+        G_STATIC_ASSERT(sizeof(legacy_refpoints) <= sizeof(refpoints));
+        G_STATIC_ASSERT(sizeof(legacy_maxdomains) <= sizeof(maxdomains));
+
+        nr_refpoints = 3;
+
+        memcpy(refpoints, legacy_refpoints, sizeof(legacy_refpoints));
+        memcpy(maxdomains, legacy_maxdomains, sizeof(legacy_maxdomains));
+
+        /* pseries-5.0 and older reference-points array is {0x4, 0x4} */
+        if (smc->pre_5_1_assoc_refpoints) {
+            nr_refpoints = 2;
+        }
     }
 
     _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points",
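
For readers following the distance-rounding heuristic introduced by spapr_numa_get_numa_level() above, the standalone sketch below reproduces the same mapping and prints it for a few sample distances. It is illustrative only and not part of the patch; numa_level_for_distance is a name made up for this example, while the thresholds and return values are copied from the diff.

/*
 * Standalone sketch of the distance -> NUMA level rounding used by the
 * patch above.  Compile with any C compiler and run to see the mapping.
 */
#include <stdio.h>
#include <stdint.h>

static uint8_t numa_level_for_distance(uint8_t distance)
{
    if (distance == 10) {
        return 0x4;   /* local distance, no rounding */
    } else if (distance > 11 && distance <= 30) {
        return 0x3;   /* rounded to 20 */
    } else if (distance > 31 && distance <= 60) {
        return 0x2;   /* rounded to 40 */
    } else if (distance > 61 && distance <= 120) {
        return 0x1;   /* rounded to 80 */
    }
    return 0;         /* no match: the kernel ends up computing 160 */
}

int main(void)
{
    const uint8_t samples[] = { 10, 20, 40, 80, 160 };
    size_t i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        printf("distance %3d -> NUMA level 0x%x\n",
               samples[i], (unsigned)numa_level_for_distance(samples[i]));
    }
    return 0;
}

The user distances themselves come from the QEMU command line, e.g. "-numa dist,src=0,dst=1,val=40" together with the mirror "-numa dist,src=1,dst=0,val=40" entry; with this series the pSeries machine refuses to start when the two directions disagree, per spapr_numa_is_symmetrical() above.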