// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later /* * Interface with the On Chip Controller, * which enforces power and thermal management * * Copyright 2013-2019 IBM Corp. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* OCC Communication Area for PStates */ #define P8_HOMER_OPAL_DATA_OFFSET 0x1F8000 #define P9_HOMER_OPAL_DATA_OFFSET 0x0E2000 #define OPAL_DYNAMIC_DATA_OFFSET 0x0B80 /* relative to HOMER_OPAL_DATA_OFFSET */ #define MAX_PSTATES 256 #define MAX_P8_CORES 12 #define MAX_P9_CORES 24 #define MAX_P10_CORES 32 #define MAX_OPAL_CMD_DATA_LENGTH 4090 #define MAX_OCC_RSP_DATA_LENGTH 8698 #define P8_PIR_CORE_MASK 0xFFF8 #define P9_PIR_QUAD_MASK 0xFFF0 #define P10_PIR_CHIP_MASK 0x0000 #define FREQ_MAX_IN_DOMAIN 0 #define FREQ_MOST_RECENTLY_SET 1 /** * OCC-OPAL Shared Memory Region * * Reference document : * https://github.com/open-power/docs/blob/master/occ/OCC_OpenPwr_FW_Interfaces.pdf * * Supported layout versions: * - 0x01, 0x02 : P8 * https://github.com/open-power/occ/blob/master_p8/src/occ/proc/proc_pstate.h * * - 0x90 : P9 * https://github.com/open-power/occ/blob/master/src/occ_405/proc/proc_pstate.h * In 0x90 the data is separated into :- * -- Static Data (struct occ_pstate_table): Data is written once by OCC * -- Dynamic Data (struct occ_dynamic_data): Data is updated at runtime * * struct occ_pstate_table - Pstate table layout * @valid: Indicates if data is valid * @version: Layout version [Major/Minor] * @v2.throttle: Reason for limiting the max pstate * @v9.occ_role: OCC role (Master/Slave) * @v#.pstate_min: Minimum pstate ever allowed * @v#.pstate_nom: Nominal pstate * @v#.pstate_turbo: Maximum turbo pstate * @v#.pstate_ultra_turbo: Maximum ultra turbo pstate and the maximum * pstate ever allowed * @v#.pstates: Pstate-id and frequency list from Pmax to Pmin * @v#.pstates.id: Pstate-id * @v#.pstates.flags: 
Pstate-flag(reserved)
 * @v2.pstates.vdd: Voltage Identifier
 * @v2.pstates.vcs: Voltage Identifier
 * @v#.pstates.freq_khz: Frequency in KHz
 * @v#.core_max[1..N]: Max pstate with N active cores
 * @spare/reserved/pad: Unused data
 */
struct occ_pstate_table {
	/* Compared against 1 by the init and throttle-poll paths in this file */
	u8 valid;
	/*
	 * Layout version. The high nibble selects which member of the union
	 * below is meaningful: 0x01/0x02 -> v2, 0x90 -> v9, 0xA0 -> v10.
	 */
	u8 version;
	union __packed {
		struct __packed { /* Version 0x01 and 0x02 */
			u8 throttle;
			/* P8 pstate ids are signed: 0 is Pmax, -N is Pmin */
			s8 pstate_min;
			s8 pstate_nom;
			s8 pstate_turbo;
			s8 pstate_ultra_turbo;
			u8 spare;
			u64 reserved;
			struct __packed {
				s8 id;
				u8 flags;
				u8 vdd;
				u8 vcs;
				/* Stored big-endian; converted with be32_to_cpu() */
				__be32 freq_khz;
			} pstates[MAX_PSTATES];
			s8 core_max[MAX_P8_CORES];
			u8 pad[100];
		} v2;
		struct __packed { /* Version 0x90 */
			u8 occ_role;
			/* P9 and newer pstate ids are unsigned: 0 is Pmax, +N is Pmin */
			u8 pstate_min;
			u8 pstate_nom;
			u8 pstate_turbo;
			u8 pstate_ultra_turbo;
			u8 spare;
			u64 reserved1;
			u64 reserved2;
			struct __packed {
				u8 id;
				u8 flags;
				u16 reserved;
				/* Stored big-endian; converted with be32_to_cpu() */
				__be32 freq_khz;
			} pstates[MAX_PSTATES];
			u8 core_max[MAX_P9_CORES];
			u8 pad[56];
		} v9;
		struct __packed { /* Version 0xA0 */
			u8 occ_role;
			u8 pstate_min;
			/* Used as the nominal pstate by add_cpu_pstate_properties() */
			u8 pstate_fixed_freq;
			/* Published as "ibm,pstate-turbo" and "ibm,pstate-base" */
			u8 pstate_base;
			/* Used as Pmax when the dynamic data reports WOF enabled */
			u8 pstate_ultra_turbo;
			/* Used as Pmax when WOF is not enabled */
			u8 pstate_fmax;
			u8 minor;
			u8 pstate_bottom_throttle;
			u8 spare;
			u8 spare1;
			u32 reserved_32;
			u64 reserved_64;
			struct __packed {
				u8 id;
				/* Entries with valid == 0 are skipped by parse_pstates_v10() */
				u8 valid;
				u16 reserved;
				/* Stored big-endian; converted with be32_to_cpu() */
				__be32 freq_khz;
			} pstates[MAX_PSTATES];
			u8 core_max[MAX_P10_CORES];
			u8 pad[48];
		} v10;
	};
} __packed;

/**
 * OPAL-OCC Command Response Interface
 *
 * OPAL-OCC Command Buffer
 *
 * ---------------------------------------------------------------------
 * | OPAL | Cmd | OPAL | | Cmd Data | Cmd Data | OPAL |
 * | Cmd | Request | OCC | Reserved | Length | Length | Cmd |
 * | Flags | ID | Cmd | | (MSB) | (LSB) | Data...
|
 * ---------------------------------------------------------------------
 * | ….OPAL Command Data up to max of Cmd Data Length 4090 bytes |
 * | |
 * ---------------------------------------------------------------------
 *
 * OPAL Command Flag
 *
 * -----------------------------------------------------------------
 * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 |
 * | (msb) | | | | | | | (lsb) |
 * -----------------------------------------------------------------
 * |Cmd | | | | | | | |
 * |Ready | | | | | | | |
 * -----------------------------------------------------------------
 *
 * struct opal_command_buffer - Defines the layout of OPAL command buffer
 * @flag: Provides general status of the command
 * @request_id: Token to identify request
 * @cmd: Command sent
 * @data_size: Command data length
 * @data: Command specific data
 * @spare: Unused byte
 */
struct opal_command_buffer {
	/* write_occ_cmd() clears this first and sets OPAL_FLAG_CMD_READY last */
	u8 flag;
	/* Incremented by write_occ_cmd() for every command written */
	u8 request_id;
	/* One of the cmd_value bytes from the occ_cmds[] table */
	u8 cmd;
	u8 spare;
	/*
	 * NOTE(review): the diagram above lists the length as MSB then LSB,
	 * which suggests a big-endian field, but it is declared as a plain
	 * u16 and written without conversion -- confirm for little-endian
	 * builds.
	 */
	u16 data_size;
	u8 data[MAX_OPAL_CMD_DATA_LENGTH];
} __packed;

/**
 * OPAL-OCC Response Buffer
 *
 * ---------------------------------------------------------------------
 * | OCC | Cmd | OPAL | Response | Rsp Data | Rsp Data | OPAL |
 * | Rsp | Request | OCC | Status | Length | Length | Rsp |
 * | Flags | ID | Cmd | | (MSB) | (LSB) | Data...
|
 * ---------------------------------------------------------------------
 * | ….OPAL Response Data up to max of Rsp Data Length 8698 bytes |
 * | |
 * ---------------------------------------------------------------------
 *
 * OCC Response Flag
 *
 * -----------------------------------------------------------------
 * | Bit 7 | Bit 6 | Bit 5 | Bit 4 | Bit 3 | Bit 2 | Bit 1 | Bit 0 |
 * | (msb) | | | | | | | (lsb) |
 * -----------------------------------------------------------------
 * | | | | | | |OCC in | Rsp |
 * | | | | | | |progress|Ready |
 * -----------------------------------------------------------------
 *
 * struct occ_response_buffer - Defines the layout of OCC response buffer
 * @flag: Provides general status of the response
 * @request_id: Token to identify request
 * @cmd: Command requested
 * @status: Indicates success/failure status of
 * the command
 * @data_size: Response data length
 * @data: Response specific data
 */
struct occ_response_buffer {
	/*
	 * Bit 0 is OCC_FLAG_RSP_READY and bit 1 is OCC_FLAG_CMD_IN_PROGRESS;
	 * write_occ_cmd() zeroes this byte before posting a new command.
	 */
	u8 flag;
	/* Compared with the command's request_id by check_occ_rsp() */
	u8 request_id;
	/* Compared with the command's cmd byte by check_occ_rsp() */
	u8 cmd;
	/* Takes the values listed in enum occ_response_status */
	u8 status;
	/*
	 * NOTE(review): the diagram above lists the length as MSB then LSB,
	 * which suggests a big-endian field, but it is declared as a plain
	 * u16 -- confirm for little-endian builds.
	 */
	u16 data_size;
	u8 data[MAX_OCC_RSP_DATA_LENGTH];
} __packed;

/**
 * OCC-OPAL Shared Memory Interface Dynamic Data Vx90
 *
 * struct occ_dynamic_data - Contains runtime attributes
 * @occ_state: Current state of OCC
 * @major_version: Major version number
 * @minor_version: Minor version number (backwards compatible)
 * Version 1 indicates GPU presence populated
 * @gpus_present: Bitmask of GPUs present (on systems where GPU
 * presence is detected through APSS)
 * @cpu_throttle: Reason for limiting the max pstate
 * @mem_throttle: Reason for throttling memory
 * @quick_pwr_drop: Indicates if QPD is asserted
 * @pwr_shifting_ratio: Indicates the current percentage of power to
 * take away from the CPU vs GPU when shifting
 * power to maintain a power cap. Value of 100
 * means take all power from CPU.
 * @pwr_cap_type: Indicates type of power cap in effect
 * @hard_min_pwr_cap: Hard minimum system power cap in Watts.
* Guaranteed unless hardware failure * @max_pwr_cap: Maximum allowed system power cap in Watts * @cur_pwr_cap: Current system power cap * @soft_min_pwr_cap: Soft powercap minimum. OCC may or may not be * able to maintain this * @spare/reserved: Unused data * @cmd: Opal Command Buffer * @rsp: OCC Response Buffer */ struct occ_dynamic_data { u8 occ_state; u8 major_version; u8 minor_version; u8 gpus_present; struct __packed { /* Version 0x90 */ u8 spare1; } v9; struct __packed { /* Version 0xA0 */ u8 wof_enabled; } v10; u8 cpu_throttle; u8 mem_throttle; u8 quick_pwr_drop; u8 pwr_shifting_ratio; u8 pwr_cap_type; u16 hard_min_pwr_cap; u16 max_pwr_cap; u16 cur_pwr_cap; u16 soft_min_pwr_cap; u8 pad[110]; struct opal_command_buffer cmd; struct occ_response_buffer rsp; } __packed; static bool occ_reset; static struct lock occ_lock = LOCK_UNLOCKED; static unsigned long homer_opal_data_offset; DEFINE_LOG_ENTRY(OPAL_RC_OCC_PSTATE_INIT, OPAL_PLATFORM_ERR_EVT, OPAL_OCC, OPAL_CEC_HARDWARE, OPAL_INFO, OPAL_NA); DEFINE_LOG_ENTRY(OPAL_RC_OCC_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_OCC, OPAL_CEC_HARDWARE, OPAL_UNRECOVERABLE_ERR_GENERAL, OPAL_NA); /* * POWER9 and newer platforms have pstate values which are unsigned * positive values. They are continuous set of unsigned integers * [0 to +N] where Pmax is 0 and Pmin is N. The linear ordering of * pstates for P9 has changed compared to P8. Where P8 has negative * pstate values advertised as [0 to -N] where Pmax is 0 and * Pmin is -N. The following routine helps to abstract pstate * comparison with pmax and perform sanity checks on pstate limits. */ /** * cmp_pstates: Compares the given two pstates and determines which * among them is associated with a higher pstate. * * @a,@b: The pstate ids of the pstates being compared. * * Returns: -1 : If pstate associated with @a is smaller than * the pstate associated with @b. * 0 : If pstates associated with @a and @b are equal. 
* 1 : If pstate associated with @a is greater than * the pstate associated with @b. */ static int cmp_pstates(int a, int b) { /* P8 has 0 to -N (pmax to pmin), P9 has 0 to +N (pmax to pmin) */ if (a > b) return (proc_gen == proc_gen_p8)? 1 : -1; else if (a < b) return (proc_gen == proc_gen_p8)? -1 : 1; return 0; } static inline struct occ_pstate_table *get_occ_pstate_table(struct proc_chip *chip) { return (struct occ_pstate_table *) (chip->homer_base + homer_opal_data_offset); } static inline struct occ_dynamic_data *get_occ_dynamic_data(struct proc_chip *chip) { return (struct occ_dynamic_data *) (chip->homer_base + homer_opal_data_offset + OPAL_DYNAMIC_DATA_OFFSET); } /* * On Chips which have at least one active EX unit, check the * HOMER area for pstate-table valid bit on versions 0x1 and 0x2, or * HOMER dynamic area occ_state on version 0x90. */ static bool wait_for_all_occ_init(void) { struct proc_chip *chip; struct dt_node *xn; struct occ_pstate_table *occ_data; struct occ_dynamic_data *occ_dyn_data; int tries; uint64_t start_time, end_time; uint32_t timeout = 0; if (platform.occ_timeout) timeout = platform.occ_timeout(); start_time = mftb(); for_each_chip(chip) { u8 version; /* * If the chip doesn't any EX unit present, then OCC * will not update the pstate-table. So, skip the * check. */ if (!chip->ex_present) { prlog(PR_DEBUG, "OCC: Chip %02x has no active EX units. Skipping check\n", chip->id); continue; } /* Check for valid homer address */ if (!chip->homer_base) { /** * @fwts-label OCCInvalidHomerBase * @fwts-advice The HOMER base address for a chip * was not valid. This means that OCC (On Chip * Controller) will be non-functional and CPU * frequency scaling will not be functional. CPU may * be set to a safe, low frequency. Power savings in * CPU idle or CPU hotplug may be impacted. 
*/ prlog(PR_ERR,"OCC: Chip: %x homer_base is not valid\n", chip->id); return false; } /* Get PState table address */ occ_data = get_occ_pstate_table(chip); /* * Wait for the OCC to set an appropriate version bit. * The wait is needed since on some platforms (such P8 * Tuletta), OCC is not loaded before OPAL boot. Hence * initialization can take a while. * * Note: Checking for occ_data->version == (0x01/0x02/0x90/0xA0) * is ok because we clear all of * homer_base+size before passing memory to host * services. This ensures occ_data->version == 0x0 * before OCC load. */ tries = timeout * 10; while (tries--) { version = occ_data->version; if (version == 0x01 || version == 0x02 || version == 0x90 || version == 0xA0) break; time_wait_ms(100); } version = occ_data->version; switch (version) { case 0x1: case 0x2: /* * OCC-OPAL interface version 0x1 and 0x2 do not have * the dynamic data. Hence the the only way to figure out * if the OCC is up or not is to check the valid-bit * in the pstate table. */ if (occ_data->valid != 1) { /** * @fwts-label OCCInvalidPStateTable * @fwts-advice The pstate table for a chip * was not valid. This means that OCC (On Chip * Controller) will be non-functional and CPU * frequency scaling will not be functional. CPU may * be set to a low, safe frequency. This means * that CPU idle states and CPU frequency scaling * may not be functional. */ prlog(PR_ERR, "OCC: Chip: %x PState table is not valid\n", chip->id); return false; } break; case 0x90: /* * OCC-OPAL interface version 0x90 has a * dynamic data section. This has an * occ_state field whose values inform about * the state of the OCC. * * 0x00 = OCC not running. No communication * allowed. * * 0x01 = Standby. No communication allowed. * * 0x02 = Observation State. Communication * allowed and is command dependent. * * 0x03 = Active State. Communication allowed * and is command dependent. * * 0x04 = Safe State. No communication * allowed. 
Just like CPU throttle * status, some failures will not allow * for OCC to update state to safe. * * 0x05 = Characterization State. * Communication allowed and is command * dependent. * * We will error out if OCC is not in the * Active State. * * XXX : Should we error out only if no * communication is allowed with the * OCC ? */ occ_dyn_data = get_occ_dynamic_data(chip); if (occ_dyn_data->occ_state != 0x3) { /** * @fwts-label OCCInactive * @fwts-advice The OCC for a chip was not active. * This means that CPU frequency scaling will * not be functional. CPU may be set to a low, * safe frequency. This means that CPU idle * states and CPU frequency scaling may not be * functional. */ prlog(PR_ERR, "OCC: Chip: %x: OCC not active\n", chip->id); return false; } break; case 0xA0: /* * OCC-OPAL interface version 0x90 has a * dynamic data section. This has an * occ_state field whose values inform about * the state of the OCC. * * 0x00 = OCC not running. No communication * allowed. * * 0x01 = Standby. No communication allowed. * * 0x02 = Observation State. Communication * allowed and is command dependent. * * 0x03 = Active State. Communication allowed * and is command dependent. * * 0x04 = Safe State. No communication * allowed. Just like CPU throttle * status, some failures will not allow * for OCC to update state to safe. * * 0x05 = Characterization State. * Communication allowed and is command * dependent. * * We will error out if OCC is not in the * Active State. * * XXX : Should we error out only if no * communication is allowed with the * OCC ? */ occ_dyn_data = get_occ_dynamic_data(chip); if (occ_dyn_data->occ_state != 0x3) { /** * @fwts-label OCCInactive * @fwts-advice The OCC for a chip was not active. * This means that CPU frequency scaling will * not be functional. CPU may be set to a low, * safe frequency. This means that CPU idle * states and CPU frequency scaling may not be * functional. 
*/ prlog(PR_ERR, "OCC: Chip: %x: OCC not active\n", chip->id); return false; } break; default: prlog(PR_ERR, "OCC: Unknown OCC-OPAL interface version.\n"); return false; } if (!chip->occ_functional) chip->occ_functional = true; prlog(PR_DEBUG, "OCC: Chip %02x Data (%016llx) = %016llx\n", chip->id, (uint64_t)occ_data, be64_to_cpu(*(__be64 *)occ_data)); if (version == 0x90 || version == 0xA0) { occ_dyn_data = get_occ_dynamic_data(chip); prlog(PR_DEBUG, "OCC: Chip %02x Dynamic Data (%016llx) = %016llx\n", chip->id, (uint64_t)occ_dyn_data, be64_to_cpu(*(__be64 *)occ_dyn_data)); } } end_time = mftb(); prlog(PR_NOTICE, "OCC: All Chip Rdy after %lu ms\n", tb_to_msecs(end_time - start_time)); dt_for_each_compatible(dt_root, xn, "ibm,xscom") { const struct dt_property *p; p = dt_find_property(xn, "ibm,occ-functional-state"); if (!p) dt_add_property_cells(xn, "ibm,occ-functional-state", 0x1); } return true; } /* * OCC provides pstate table entries in continuous descending order. * Parse the pstate table to skip pstate_ids that are greater * than Pmax. If a pstate_id is equal to Pmin then add it to * the list and break from the loop as this is the last valid * element in the pstate table. 
*/ static void parse_pstates_v2(struct occ_pstate_table *data, __be32 *dt_id, __be32 *dt_freq, int nr_pstates, int pmax, int pmin) { int i, j; for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) { if (cmp_pstates(data->v2.pstates[i].id, pmax) > 0) continue; dt_id[j] = cpu_to_be32(data->v2.pstates[i].id); dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v2.pstates[i].freq_khz) / 1000); j++; if (data->v2.pstates[i].id == pmin) break; } if (j != nr_pstates) prerror("OCC: Expected pstates(%d) is not equal to parsed pstates(%d)\n", nr_pstates, j); } static void parse_pstates_v9(struct occ_pstate_table *data, __be32 *dt_id, __be32 *dt_freq, int nr_pstates, int pmax, int pmin) { int i, j; for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) { if (cmp_pstates(data->v9.pstates[i].id, pmax) > 0) continue; dt_id[j] = cpu_to_be32(data->v9.pstates[i].id); dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v9.pstates[i].freq_khz) / 1000); j++; if (data->v9.pstates[i].id == pmin) break; } if (j != nr_pstates) prerror("OCC: Expected pstates(%d) is not equal to parsed pstates(%d)\n", nr_pstates, j); } static void parse_pstates_v10(struct occ_pstate_table *data, __be32 *dt_id, __be32 *dt_freq, int nr_pstates, int pmax, int pmin) { int i, j; int invalid = 0; for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) { if (cmp_pstates(data->v10.pstates[i].id, pmax) > 0) continue; if (!data->v10.pstates[i].valid) { prlog(PR_WARNING, "OCC: Found Invalid pstate with index %d. 
Skipping it.\n", i); invalid++; continue; } dt_id[j] = cpu_to_be32(data->v10.pstates[i].id); dt_freq[j] = cpu_to_be32(be32_to_cpu(data->v10.pstates[i].freq_khz) / 1000); j++; if (data->v10.pstates[i].id == pmin) break; } if ((j + invalid) != nr_pstates) { prerror("OCC: Expected pstates(%d) not equal to (Parsed pstates(%d) + Invalid Pstates (%d))\n", nr_pstates, j, invalid); } } static void parse_vid(struct occ_pstate_table *occ_data, struct dt_node *node, u8 nr_pstates, int pmax, int pmin) { u8 *dt_vdd, *dt_vcs; int i, j; dt_vdd = malloc(nr_pstates); assert(dt_vdd); dt_vcs = malloc(nr_pstates); assert(dt_vcs); for (i = 0, j = 0; i < MAX_PSTATES && j < nr_pstates; i++) { if (cmp_pstates(occ_data->v2.pstates[i].id, pmax) > 0) continue; dt_vdd[j] = occ_data->v2.pstates[i].vdd; dt_vcs[j] = occ_data->v2.pstates[i].vcs; j++; if (occ_data->v2.pstates[i].id == pmin) break; } dt_add_property(node, "ibm,pstate-vdds", dt_vdd, nr_pstates); dt_add_property(node, "ibm,pstate-vcss", dt_vcs, nr_pstates); free(dt_vdd); free(dt_vcs); } /* Add device tree properties to describe pstates states */ /* Return nominal pstate to set in each core */ static bool add_cpu_pstate_properties(struct dt_node *power_mgt, int *pstate_nom) { struct proc_chip *chip; uint64_t occ_data_area; struct occ_pstate_table *occ_data = NULL; struct occ_dynamic_data *occ_dyn_data; /* Arrays for device tree */ __be32 *dt_id, *dt_freq; int pmax, pmin, pnom; u8 nr_pstates; bool ultra_turbo_supported; int i, major, minor; prlog(PR_DEBUG, "OCC: CPU pstate state device tree init\n"); /* * Find first chip with an OCC which has as a valid * pstate-table */ for_each_chip(chip) { occ_data = get_occ_pstate_table(chip); /* Dump first 16 bytes of PState table */ occ_data_area = (uint64_t)occ_data; prlog(PR_DEBUG, "OCC: Chip %02d :Data (%16llx) = %16llx %16llx\n", chip->id, occ_data_area, be64_to_cpu(*(__be64 *)occ_data_area), be64_to_cpu(*(__be64 *)(occ_data_area + 8))); if (occ_data->valid) break; /* * XXX : Error out if 
!occ_data->valid but Chip has at * least one EX Unit? */ } assert(occ_data); if (!occ_data->valid) { /** * @fwts-label OCCInvalidPStateTableDT * @fwts-advice The pstate tables for none of the chips * are valid. This means that OCC (On Chip * Controller) will be non-functional. This means * that CPU idle states and CPU frequency scaling * will not be functional as OPAL doesn't populate * the device tree with pstates in this case. */ prlog(PR_ERR, "OCC: PState table is not valid\n"); return false; } /* * Workload-Optimized-Frequency(WOF) or Ultra-Turbo is supported * from version 0x02 onwards. If WOF is disabled then, the max * ultra_turbo pstate will be equal to max turbo pstate. */ ultra_turbo_supported = true; major = occ_data->version >> 4; minor = occ_data->version & 0xF; /* Parse Pmax, Pmin and Pnominal */ switch (major) { case 0: if (proc_gen >= proc_gen_p9) { /** * @fwts-label OCCInvalidVersion02 * @fwts-advice The PState table layout version is not * supported in P9. So OPAL will not parse the PState * table. CPU frequency scaling will not be functional * as frequency and pstate-ids are not added to DT. */ prerror("OCC: Version %x is not supported in P9\n", occ_data->version); return false; } if (minor == 0x1) ultra_turbo_supported = false; pmin = occ_data->v2.pstate_min; pnom = occ_data->v2.pstate_nom; if (ultra_turbo_supported) pmax = occ_data->v2.pstate_ultra_turbo; else pmax = occ_data->v2.pstate_turbo; break; case 0x9: if (proc_gen == proc_gen_p8) { /** * @fwts-label OCCInvalidVersion90 * @fwts-advice The PState table layout version is not * supported in P8. So OPAL will not parse the PState * table. CPU frequency scaling will not be functional * as frequency and pstate-ids are not added to DT. 
*/ prerror("OCC: Version %x is not supported in P8\n", occ_data->version); return false; } pmin = occ_data->v9.pstate_min; pnom = occ_data->v9.pstate_nom; pmax = occ_data->v9.pstate_ultra_turbo; break; case 0xA: pmin = occ_data->v10.pstate_min; pnom = occ_data->v10.pstate_fixed_freq; occ_dyn_data = get_occ_dynamic_data(chip); if (occ_dyn_data->v10.wof_enabled) pmax = occ_data->v10.pstate_ultra_turbo; else pmax = occ_data->v10.pstate_fmax; break; default: /** * @fwts-label OCCUnsupportedVersion * @fwts-advice The PState table layout version is not * supported. So OPAL will not parse the PState table. * CPU frequency scaling will not be functional as OPAL * doesn't populate the device tree with pstates. */ prerror("OCC: Unsupported pstate table layout version %d\n", occ_data->version); return false; } /* Sanity check for pstate limits */ if (cmp_pstates(pmin, pmax) > 0) { /** * @fwts-label OCCInvalidPStateLimits * @fwts-advice The min pstate is greater than the * max pstate, this could be due to corrupted/invalid * data in OCC-OPAL shared memory region. So OPAL has * not added pstates to device tree. This means that * CPU Frequency management will not be functional in * the host. */ prerror("OCC: Invalid pstate limits. Pmin(%d) > Pmax (%d)\n", pmin, pmax); return false; } if (cmp_pstates(pnom, pmax) > 0) { /** * @fwts-label OCCInvalidNominalPState * @fwts-advice The nominal pstate is greater than the * max pstate, this could be due to corrupted/invalid * data in OCC-OPAL shared memory region. So OPAL has * limited the nominal pstate to max pstate. 
*/ prerror("OCC: Clipping nominal pstate(%d) to Pmax(%d)\n", pnom, pmax); pnom = pmax; } nr_pstates = labs(pmax - pmin) + 1; prlog(PR_DEBUG, "OCC: Version %x Min %d Nom %d Max %d Nr States %d\n", occ_data->version, pmin, pnom, pmax, nr_pstates); if (((major == 0x9 || major == 0xA) && nr_pstates <= 1) || (major == 0 && (nr_pstates <= 1 || nr_pstates > 128))) { /** * @fwts-label OCCInvalidPStateRange * @fwts-advice The number of pstates is outside the valid * range (currently <=1 or > 128 on p8, >255 on P9), so OPAL * has not added pstates to the device tree. This means that * OCC (On Chip Controller) will be non-functional. This means * that CPU idle states and CPU frequency scaling * will not be functional. */ prerror("OCC: OCC range is not valid; No of pstates = %d\n", nr_pstates); return false; } dt_id = malloc(nr_pstates * sizeof(__be32)); assert(dt_id); dt_freq = malloc(nr_pstates * sizeof(__be32)); assert(dt_freq); switch (major) { case 0: parse_pstates_v2(occ_data, dt_id, dt_freq, nr_pstates, pmax, pmin); break; case 0x9: parse_pstates_v9(occ_data, dt_id, dt_freq, nr_pstates, pmax, pmin); break; case 0xA: parse_pstates_v10(occ_data, dt_id, dt_freq, nr_pstates, pmax, pmin); break; default: return false; } /* Add the device-tree entries */ dt_add_property(power_mgt, "ibm,pstate-ids", dt_id, nr_pstates * sizeof(__be32)); dt_add_property(power_mgt, "ibm,pstate-frequencies-mhz", dt_freq, nr_pstates * sizeof(__be32)); dt_add_property_cells(power_mgt, "ibm,pstate-min", pmin); dt_add_property_cells(power_mgt, "ibm,pstate-nominal", pnom); dt_add_property_cells(power_mgt, "ibm,pstate-max", pmax); free(dt_freq); free(dt_id); /* * Parse and add WOF properties: turbo, ultra-turbo and core_max array. * core_max[1..n] array provides the max sustainable pstate that can be * achieved with i active cores in the chip. 
*/ if (ultra_turbo_supported) { int pturbo, pultra_turbo; u8 nr_cores = get_available_nr_cores_in_chip(chip->id); __be32 *dt_cmax; dt_cmax = malloc(nr_cores * sizeof(u32)); assert(dt_cmax); switch (major) { case 0: pturbo = occ_data->v2.pstate_turbo; pultra_turbo = occ_data->v2.pstate_ultra_turbo; for (i = 0; i < nr_cores; i++) dt_cmax[i] = cpu_to_be32(occ_data->v2.core_max[i]); break; case 0x9: pturbo = occ_data->v9.pstate_turbo; pultra_turbo = occ_data->v9.pstate_ultra_turbo; for (i = 0; i < nr_cores; i++) dt_cmax[i] = cpu_to_be32(occ_data->v9.core_max[i]); break; case 0xA: pturbo = occ_data->v10.pstate_base; pultra_turbo = occ_data->v10.pstate_ultra_turbo; for (i = 0; i < nr_cores; i++) dt_cmax[i] = cpu_to_be32(occ_data->v10.core_max[i]); break; default: return false; } if (cmp_pstates(pturbo, pmax) > 0) { prerror("OCC: Clipping turbo pstate(%d) to Pmax(%d)\n", pturbo, pmax); dt_add_property_cells(power_mgt, "ibm,pstate-turbo", pmax); } else { dt_add_property_cells(power_mgt, "ibm,pstate-turbo", pturbo); } dt_add_property_cells(power_mgt, "ibm,pstate-ultra-turbo", pultra_turbo); dt_add_property(power_mgt, "ibm,pstate-core-max", dt_cmax, nr_cores * sizeof(u32)); dt_add_property_cells(power_mgt, "ibm,pstate-base", pturbo); free(dt_cmax); } if (major == 0x9 || major == 0xA) goto out; dt_add_property_cells(power_mgt, "#address-cells", 2); dt_add_property_cells(power_mgt, "#size-cells", 1); /* Add chip specific pstate properties */ for_each_chip(chip) { struct dt_node *occ_node; occ_data = get_occ_pstate_table(chip); occ_node = dt_new_addr(power_mgt, "occ", (uint64_t)occ_data); if (!occ_node) { /** * @fwts-label OCCDTFailedNodeCreation * @fwts-advice Failed to create * /ibm,opal/power-mgt/occ. Per-chip pstate properties * are not added to Device Tree. 
*/ prerror("OCC: Failed to create /ibm,opal/power-mgt/occ@%llx\n", (uint64_t)occ_data); return false; } dt_add_property_cells(occ_node, "reg", hi32((uint64_t)occ_data), lo32((uint64_t)occ_data), OPAL_DYNAMIC_DATA_OFFSET + sizeof(struct occ_dynamic_data)); dt_add_property_cells(occ_node, "ibm,chip-id", chip->id); /* * Parse and add pstate Voltage Identifiers (VID) to DT which * are provided by OCC in version 0x01 and 0x02 */ parse_vid(occ_data, occ_node, nr_pstates, pmax, pmin); } out: /* Return pstate to set for each core */ *pstate_nom = pnom; return true; } /* * Prepare chip for pstate transitions */ static bool cpu_pstates_prepare_core(struct proc_chip *chip, struct cpu_thread *c, int pstate_nom) { uint32_t core = pir_to_core_id(c->pir); uint64_t tmp, pstate; int rc; /* * Currently Fastsleep init clears EX_PM_SPR_OVERRIDE_EN. * Need to ensure only relevant bits are inited */ /* Init PM GP1 for SCOM based PSTATE control to set nominal freq * * Use the OR SCOM to set the required bits in PM_GP1 register * since the OCC might be mainpulating the PM_GP1 register as well. */ rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_SET_GP1), EX_PM_SETUP_GP1_PM_SPR_OVERRIDE_EN); if (rc) { log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), "OCC: Failed to write PM_GP1 in pstates init\n"); return false; } /* Set new pstate to core */ rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMCR), &tmp); if (rc) { log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), "OCC: Failed to read PM_PPMCR from OCC in pstates init\n"); return false; } tmp = tmp & ~0xFFFF000000000000ULL; pstate = ((uint64_t) pstate_nom) & 0xFF; tmp = tmp | (pstate << 56) | (pstate << 48); rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMCR), tmp); if (rc) { log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), "OCC: Failed to write PM_PPMCR in pstates init\n"); return false; } time_wait_ms(1); /* Wait for PState to change */ /* * Init PM GP1 for SPR based PSTATE control. 
* Once OCC is active EX_PM_SETUP_GP1_DPLL_FREQ_OVERRIDE_EN will be * cleared by OCC. Sapphire need not clear. * However wait for DVFS state machine to become idle after min->nominal * transition initiated above. If not switch over to SPR control could fail. * * Use the AND SCOM to clear the required bits in PM_GP1 register * since the OCC might be mainpulating the PM_GP1 register as well. */ tmp = ~EX_PM_SETUP_GP1_PM_SPR_OVERRIDE_EN; rc = xscom_write(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_CLEAR_GP1), tmp); if (rc) { log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), "OCC: Failed to write PM_GP1 in pstates init\n"); return false; } /* Just debug */ rc = xscom_read(chip->id, XSCOM_ADDR_P8_EX_SLAVE(core, EX_PM_PPMSR), &tmp); if (rc) { log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT), "OCC: Failed to read PM_PPMSR from OCC" "in pstates init\n"); return false; } prlog(PR_DEBUG, "OCC: Chip %x Core %x PPMSR %016llx\n", chip->id, core, tmp); /* * If PMSR is still in transition at this point due to PState change * initiated above, then the switchover to SPR may not work. * ToDo: Check for DVFS state machine idle before change. 
*/ return true; } static bool occ_opal_msg_outstanding = false; static void occ_msg_consumed(void *data __unused, int status __unused) { lock(&occ_lock); occ_opal_msg_outstanding = false; unlock(&occ_lock); } static inline u8 get_cpu_throttle(struct proc_chip *chip) { struct occ_pstate_table *pdata = get_occ_pstate_table(chip); struct occ_dynamic_data *data; switch (pdata->version >> 4) { case 0: return pdata->v2.throttle; case 0x9: case 0xA: data = get_occ_dynamic_data(chip); return data->cpu_throttle; default: return 0; }; } bool is_occ_reset(void) { return occ_reset; } static void occ_throttle_poll(void *data __unused) { struct proc_chip *chip; struct occ_pstate_table *occ_data; struct opal_occ_msg occ_msg; int rc; if (!try_lock(&occ_lock)) return; if (occ_reset) { int inactive = 0; for_each_chip(chip) { occ_data = get_occ_pstate_table(chip); if (occ_data->valid != 1) { inactive = 1; break; } } if (!inactive) { /* * Queue OCC_THROTTLE with throttle status as 0 to * indicate all OCCs are active after a reset. 
*/ occ_msg.type = cpu_to_be64(OCC_THROTTLE); occ_msg.chip = 0; occ_msg.throttle_status = 0; rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL, sizeof(struct opal_occ_msg), &occ_msg); if (!rc) occ_reset = false; } } else { if (occ_opal_msg_outstanding) goto done; for_each_chip(chip) { u8 throttle; occ_data = get_occ_pstate_table(chip); throttle = get_cpu_throttle(chip); if ((occ_data->valid == 1) && (chip->throttle != throttle) && (throttle <= OCC_MAX_THROTTLE_STATUS)) { occ_msg.type = cpu_to_be64(OCC_THROTTLE); occ_msg.chip = cpu_to_be64(chip->id); occ_msg.throttle_status = cpu_to_be64(throttle); rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, occ_msg_consumed, sizeof(struct opal_occ_msg), &occ_msg); if (!rc) { chip->throttle = throttle; occ_opal_msg_outstanding = true; break; } } } } done: unlock(&occ_lock); } /* OPAL-OCC Command/Response Interface */ enum occ_state { OCC_STATE_NOT_RUNNING = 0x00, OCC_STATE_STANDBY = 0x01, OCC_STATE_OBSERVATION = 0x02, OCC_STATE_ACTIVE = 0x03, OCC_STATE_SAFE = 0x04, OCC_STATE_CHARACTERIZATION = 0x05, }; enum occ_role { OCC_ROLE_SLAVE = 0x0, OCC_ROLE_MASTER = 0x1, }; enum occ_cmd { OCC_CMD_CLEAR_SENSOR_DATA, OCC_CMD_SET_POWER_CAP, OCC_CMD_SET_POWER_SHIFTING_RATIO, OCC_CMD_SELECT_SENSOR_GROUP, }; struct opal_occ_cmd_info { enum occ_cmd cmd; u8 cmd_value; u16 cmd_size; u16 rsp_size; int timeout_ms; u16 state_mask; u8 role_mask; }; static struct opal_occ_cmd_info occ_cmds[] = { { OCC_CMD_CLEAR_SENSOR_DATA, 0xD0, 4, 4, 1000, PPC_BIT16(OCC_STATE_OBSERVATION) | PPC_BIT16(OCC_STATE_ACTIVE) | PPC_BIT16(OCC_STATE_CHARACTERIZATION), PPC_BIT8(OCC_ROLE_MASTER) | PPC_BIT8(OCC_ROLE_SLAVE) }, { OCC_CMD_SET_POWER_CAP, 0xD1, 2, 2, 1000, PPC_BIT16(OCC_STATE_OBSERVATION) | PPC_BIT16(OCC_STATE_ACTIVE) | PPC_BIT16(OCC_STATE_CHARACTERIZATION), PPC_BIT8(OCC_ROLE_MASTER) }, { OCC_CMD_SET_POWER_SHIFTING_RATIO, 0xD2, 1, 1, 1000, PPC_BIT16(OCC_STATE_OBSERVATION) | PPC_BIT16(OCC_STATE_ACTIVE) | PPC_BIT16(OCC_STATE_CHARACTERIZATION), PPC_BIT8(OCC_ROLE_MASTER) | 
PPC_BIT8(OCC_ROLE_SLAVE)
	},
	{	OCC_CMD_SELECT_SENSOR_GROUP,
		0xD3, 2, 2, 1000,
		PPC_BIT16(OCC_STATE_OBSERVATION) |
		PPC_BIT16(OCC_STATE_ACTIVE) |
		PPC_BIT16(OCC_STATE_CHARACTERIZATION),
		PPC_BIT8(OCC_ROLE_MASTER) |
		PPC_BIT8(OCC_ROLE_SLAVE)
	},
};

/* Status byte values found in the OCC response buffer (see read_occ_rsp()) */
enum occ_response_status {
	OCC_RSP_SUCCESS			= 0x00,
	OCC_RSP_INVALID_COMMAND		= 0x11,
	OCC_RSP_INVALID_CMD_DATA_LENGTH	= 0x12,
	OCC_RSP_INVALID_DATA		= 0x13,
	OCC_RSP_INTERNAL_ERROR		= 0x15,
};

/* Flag values of the response buffer (OCC_*) and command buffer (OPAL_*) */
#define OCC_FLAG_RSP_READY		0x01
#define OCC_FLAG_CMD_IN_PROGRESS	0x02
#define OPAL_FLAG_CMD_READY		0x80

/*
 * One command request.
 * @data: Command payload; occ_cmds[cmd].cmd_size bytes are copied from here
 * @cmd:  Which command to send
 */
struct opal_occ_cmd_data {
	u8 *data;
	enum occ_cmd cmd;
};

/*
 * Per-OCC command interface state.
 * @queue_lock:		 Protects the command bookkeeping below
 * @timeout:		 Timer armed for each written command
 * @cdata:		 Request currently being processed
 * @cmd:		 Command buffer OPAL writes
 * @rsp:		 Response buffer OPAL reads
 * @occ_state:		 Pointer to the current OCC state byte
 * @valid:		 Pointer to the "data valid" byte of the OCC
 * @chip_id:		 Chip this entry belongs to
 * @token:		 Async token of the request in progress
 * @enabled_sensor_mask: Sensor groups currently enabled
 * @occ_role:		 Role of this OCC (enum occ_role)
 * @request_id:		 Next request id to use
 * @cmd_in_progress:	 A command is awaiting its response
 * @retry:		 The command in progress has already been retried
 */
static struct cmd_interface {
	struct lock queue_lock;
	struct timer timeout;
	struct opal_occ_cmd_data *cdata;
	struct opal_command_buffer *cmd;
	struct occ_response_buffer *rsp;
	u8 *occ_state;
	u8 *valid;
	u32 chip_id;
	u32 token;
	u16 enabled_sensor_mask;
	u8 occ_role;
	u8 request_id;
	bool cmd_in_progress;
	bool retry;
} *chips;

/* Number of entries in chips[] */
static int nr_occs;

/* Look up the command interface of @chip_id; NULL when there is none. */
static inline struct cmd_interface *get_chip_cmd_interface(int chip_id)
{
	int i;

	for (i = 0; i < nr_occs; i++)
		if (chips[i].chip_id == chip_id)
			return &chips[i];

	return NULL;
}

/* True when the response flag reports that the OCC is processing a command. */
static inline bool occ_in_progress(struct cmd_interface *chip)
{
	return (chip->rsp->flag == OCC_FLAG_CMD_IN_PROGRESS);
}

/*
 * Write chip->cdata into the command buffer and arm the timeout timer.
 *
 * Returns OPAL_BUSY (and drops cmd_in_progress) when this is not a retry
 * and the OCC still reports a command in progress; otherwise returns
 * OPAL_ASYNC_COMPLETION.
 */
static int write_occ_cmd(struct cmd_interface *chip)
{
	struct opal_command_buffer *cmd = chip->cmd;
	enum occ_cmd ocmd = chip->cdata->cmd;

	if (!chip->retry && occ_in_progress(chip)) {
		chip->cmd_in_progress = false;
		return OPAL_BUSY;
	}

	/* Clear both flags first; the ready flag is set last, after the data */
	cmd->flag = chip->rsp->flag = 0;
	cmd->cmd = occ_cmds[ocmd].cmd_value;
	cmd->request_id = chip->request_id++;
	cmd->data_size = occ_cmds[ocmd].cmd_size;
	memcpy(&cmd->data, chip->cdata->data, cmd->data_size);
	cmd->flag = OPAL_FLAG_CMD_READY;

	schedule_timer(&chip->timeout,
		       msecs_to_tb(occ_cmds[ocmd].timeout_ms));

	return OPAL_ASYNC_COMPLETION;
}

/*
 * Submit @cdata to the OCC behind @chip.
 *
 * Returns OPAL_HARDWARE when the OCC data is not valid or the OCC state does
 * not allow the command, OPAL_PERMISSION when the OCC role does not accept
 * it, OPAL_BUSY when another command is in progress, otherwise the result
 * of write_occ_cmd().
 */
static int64_t opal_occ_command(struct cmd_interface *chip, int token,
				struct opal_occ_cmd_data *cdata)
{
	int rc;

	if (!(*chip->valid) ||
	    (!(PPC_BIT16(*chip->occ_state) &
occ_cmds[cdata->cmd].state_mask))) return OPAL_HARDWARE; if (!(PPC_BIT8(chip->occ_role) & occ_cmds[cdata->cmd].role_mask)) return OPAL_PERMISSION; lock(&chip->queue_lock); if (chip->cmd_in_progress) { rc = OPAL_BUSY; goto out; } chip->cdata = cdata; chip->token = token; chip->cmd_in_progress = true; chip->retry = false; rc = write_occ_cmd(chip); out: unlock(&chip->queue_lock); return rc; } static inline bool sanity_check_opal_cmd(struct opal_command_buffer *cmd, struct cmd_interface *chip) { return ((cmd->cmd == occ_cmds[chip->cdata->cmd].cmd_value) && (cmd->request_id == chip->request_id - 1) && (cmd->data_size == occ_cmds[chip->cdata->cmd].cmd_size)); } static inline bool check_occ_rsp(struct opal_command_buffer *cmd, struct occ_response_buffer *rsp) { if (cmd->cmd != rsp->cmd) { prlog(PR_DEBUG, "OCC: Command value mismatch in OCC response" "rsp->cmd = %d cmd->cmd = %d\n", rsp->cmd, cmd->cmd); return false; } if (cmd->request_id != rsp->request_id) { prlog(PR_DEBUG, "OCC: Request ID mismatch in OCC response" "rsp->request_id = %d cmd->request_id = %d\n", rsp->request_id, cmd->request_id); return false; } return true; } static inline void queue_occ_rsp_msg(int token, int rc) { int ret; ret = opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, cpu_to_be64(token), cpu_to_be64(rc)); if (ret) prerror("OCC: Failed to queue OCC response status message\n"); } static void occ_cmd_timeout_handler(struct timer *t __unused, void *data, uint64_t now __unused) { struct cmd_interface *chip = data; lock(&chip->queue_lock); if (!chip->cmd_in_progress) goto exit; if (!chip->retry) { prlog(PR_DEBUG, "OCC: Command timeout, retrying\n"); chip->retry = true; write_occ_cmd(chip); } else { chip->cmd_in_progress = false; queue_occ_rsp_msg(chip->token, OPAL_TIMEOUT); prlog(PR_DEBUG, "OCC: Command timeout after retry\n"); } exit: unlock(&chip->queue_lock); } static int read_occ_rsp(struct occ_response_buffer *rsp) { switch (rsp->status) { case OCC_RSP_SUCCESS: return OPAL_SUCCESS; case 
OCC_RSP_INVALID_COMMAND:
		prlog(PR_DEBUG, "OCC: Rsp status: Invalid command\n");
		break;
	case OCC_RSP_INVALID_CMD_DATA_LENGTH:
		prlog(PR_DEBUG, "OCC: Rsp status: Invalid command data length\n");
		break;
	case OCC_RSP_INVALID_DATA:
		prlog(PR_DEBUG, "OCC: Rsp status: Invalid command data\n");
		break;
	case OCC_RSP_INTERNAL_ERROR:
		prlog(PR_DEBUG, "OCC: Rsp status: OCC internal error\n");
		break;
	default:
		break;
	}

	/* Clear the OCC response flag */
	rsp->flag = 0;
	return OPAL_INTERNAL_ERROR;
}

/*
 * Process a pending OCC response for @chip_id, if any.
 *
 * Does nothing when the chip has no command interface, the response is not
 * flagged ready, or no command is in progress.  A response that does not
 * match the command is retried once and then failed with
 * OPAL_INTERNAL_ERROR; a matching response completes the request with the
 * status translated by read_occ_rsp().
 */
static void handle_occ_rsp(uint32_t chip_id)
{
	struct cmd_interface *chip;
	struct opal_command_buffer *cmd;
	struct occ_response_buffer *rsp;

	chip = get_chip_cmd_interface(chip_id);
	if (!chip)
		return;

	cmd = chip->cmd;
	rsp = chip->rsp;

	/* Read the response only once the OCC flagged it ready */
	if (rsp->flag != OCC_FLAG_RSP_READY)
		return;
	lock(&chip->queue_lock);
	if (!chip->cmd_in_progress)
		goto exit;

	/* A response arrived, so the timeout must no longer fire */
	cancel_timer(&chip->timeout);
	if (!sanity_check_opal_cmd(cmd, chip) ||
	    !check_occ_rsp(cmd, rsp)) {
		if (!chip->retry) {
			prlog(PR_DEBUG, "OCC: Command-response mismatch, retrying\n");
			chip->retry = true;
			write_occ_cmd(chip);
		} else {
			chip->cmd_in_progress = false;
			queue_occ_rsp_msg(chip->token,
					  OPAL_INTERNAL_ERROR);
			prlog(PR_DEBUG, "OCC: Command-response mismatch\n");
		}
		goto exit;
	}

	/* Cache the new sensor mask once the OCC accepted the selection */
	if (rsp->cmd == occ_cmds[OCC_CMD_SELECT_SENSOR_GROUP].cmd_value &&
	    rsp->status == OCC_RSP_SUCCESS)
		chip->enabled_sensor_mask = *(u16 *)chip->cdata->data;

	chip->cmd_in_progress = false;
	queue_occ_rsp_msg(chip->token, read_occ_rsp(chip->rsp));
exit:
	unlock(&chip->queue_lock);
}

/*
 * Report whether GPU slot @gpu_num (at most 2) is flagged present in the
 * OCC dynamic data of @chip.
 *
 * Waits in 100ms steps, for at most 20 steps across all calls (the retry
 * budget and the "found" result are static), until the dynamic data shows
 * major version 0 with minor version >= 1.  If that never happens the GPU
 * is assumed to be present.
 */
bool occ_get_gpu_presence(struct proc_chip *chip, int gpu_num)
{
	struct occ_dynamic_data *ddata;
	static int max_retries = 20;
	static bool found = false;

	assert(gpu_num <= 2);

	ddata = get_occ_dynamic_data(chip);
	while (!found && max_retries) {
		if (ddata->major_version == 0 && ddata->minor_version >= 1) {
			found = true;
			break;
		}
		time_wait_ms(100);
		max_retries--;
		ddata = get_occ_dynamic_data(chip);
	}

	if (!found) {
		prlog(PR_INFO, "OCC: No GPU slot presence, assuming GPU present\n");
		return true;
	}
return (bool)(ddata->gpus_present & 1 << gpu_num); } static void occ_add_powercap_sensors(struct dt_node *power_mgt); static void occ_add_psr_sensors(struct dt_node *power_mgt); static void occ_cmd_interface_init(void) { struct occ_dynamic_data *data; struct occ_pstate_table *pdata; struct dt_node *power_mgt; struct proc_chip *chip; int i = 0, major; /* Check if the OCC data is valid */ for_each_chip(chip) { pdata = get_occ_pstate_table(chip); if (!pdata->valid) return; } chip = next_chip(NULL); pdata = get_occ_pstate_table(chip); major = pdata->version >> 4; if (major != 0x9 || major != 0xA) return; for_each_chip(chip) nr_occs++; chips = malloc(sizeof(*chips) * nr_occs); assert(chips); for_each_chip(chip) { pdata = get_occ_pstate_table(chip); data = get_occ_dynamic_data(chip); chips[i].chip_id = chip->id; chips[i].occ_state = &data->occ_state; chips[i].valid = &pdata->valid; chips[i].cmd = &data->cmd; chips[i].rsp = &data->rsp; switch (major) { case 0x9: chips[i].occ_role = pdata->v9.occ_role; break; case 0xA: chips[i].occ_role = pdata->v10.occ_role; break; } init_lock(&chips[i].queue_lock); chips[i].cmd_in_progress = false; chips[i].request_id = 0; chips[i].enabled_sensor_mask = OCC_ENABLED_SENSOR_MASK; init_timer(&chips[i].timeout, occ_cmd_timeout_handler, &chips[i]); i++; } power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt"); if (!power_mgt) { prerror("OCC: dt node /ibm,opal/power-mgt not found\n"); return; } /* Add powercap sensors to DT */ occ_add_powercap_sensors(power_mgt); /* Add power-shifting-ratio CPU-GPU sensors to DT */ occ_add_psr_sensors(power_mgt); } /* Powercap interface */ enum sensor_powercap_occ_attr { POWERCAP_OCC_SOFT_MIN, POWERCAP_OCC_MAX, POWERCAP_OCC_CUR, POWERCAP_OCC_HARD_MIN, }; static void occ_add_powercap_sensors(struct dt_node *power_mgt) { struct dt_node *pcap, *node; u32 handle; pcap = dt_new(power_mgt, "powercap"); if (!pcap) { prerror("OCC: Failed to create powercap node\n"); return; } dt_add_property_string(pcap, 
"compatible", "ibm,opal-powercap"); node = dt_new(pcap, "system-powercap"); if (!node) { prerror("OCC: Failed to create system powercap node\n"); return; } handle = powercap_make_handle(POWERCAP_CLASS_OCC, POWERCAP_OCC_CUR); dt_add_property_cells(node, "powercap-current", handle); handle = powercap_make_handle(POWERCAP_CLASS_OCC, POWERCAP_OCC_SOFT_MIN); dt_add_property_cells(node, "powercap-min", handle); handle = powercap_make_handle(POWERCAP_CLASS_OCC, POWERCAP_OCC_MAX); dt_add_property_cells(node, "powercap-max", handle); handle = powercap_make_handle(POWERCAP_CLASS_OCC, POWERCAP_OCC_HARD_MIN); dt_add_property_cells(node, "powercap-hard-min", handle); } int occ_get_powercap(u32 handle, u32 *pcap) { struct occ_pstate_table *pdata; struct occ_dynamic_data *ddata; struct proc_chip *chip; chip = next_chip(NULL); pdata = get_occ_pstate_table(chip); ddata = get_occ_dynamic_data(chip); if (!pdata->valid) return OPAL_HARDWARE; switch (powercap_get_attr(handle)) { case POWERCAP_OCC_SOFT_MIN: *pcap = ddata->soft_min_pwr_cap; break; case POWERCAP_OCC_MAX: *pcap = ddata->max_pwr_cap; break; case POWERCAP_OCC_CUR: *pcap = ddata->cur_pwr_cap; break; case POWERCAP_OCC_HARD_MIN: *pcap = ddata->hard_min_pwr_cap; break; default: *pcap = 0; return OPAL_UNSUPPORTED; } return OPAL_SUCCESS; } static u16 pcap_cdata; static struct opal_occ_cmd_data pcap_data = { .data = (u8 *)&pcap_cdata, .cmd = OCC_CMD_SET_POWER_CAP, }; int __attribute__((__const__)) occ_set_powercap(u32 handle, int token, u32 pcap) { struct occ_dynamic_data *ddata; struct proc_chip *chip; int i; if (powercap_get_attr(handle) != POWERCAP_OCC_CUR) return OPAL_PERMISSION; if (!chips) return OPAL_HARDWARE; for (i = 0; i < nr_occs; i++) if (chips[i].occ_role == OCC_ROLE_MASTER) break; if (!(*chips[i].valid)) return OPAL_HARDWARE; chip = get_chip(chips[i].chip_id); ddata = get_occ_dynamic_data(chip); if (pcap == ddata->cur_pwr_cap) return OPAL_SUCCESS; if (pcap && (pcap > ddata->max_pwr_cap || pcap < 
ddata->soft_min_pwr_cap)) return OPAL_PARAMETER; pcap_cdata = pcap; return opal_occ_command(&chips[i], token, &pcap_data); }; /* Power-Shifting Ratio */ enum psr_type { PSR_TYPE_CPU_TO_GPU, /* 0% Cap GPU first, 100% Cap CPU first */ }; int occ_get_psr(u32 handle, u32 *ratio) { struct occ_dynamic_data *ddata; struct proc_chip *chip; u8 i = psr_get_rid(handle); if (psr_get_type(handle) != PSR_TYPE_CPU_TO_GPU) return OPAL_UNSUPPORTED; if (i > nr_occs) return OPAL_UNSUPPORTED; if (!(*chips[i].valid)) return OPAL_HARDWARE; chip = get_chip(chips[i].chip_id); ddata = get_occ_dynamic_data(chip); *ratio = ddata->pwr_shifting_ratio; return OPAL_SUCCESS; } static u8 psr_cdata; static struct opal_occ_cmd_data psr_data = { .data = &psr_cdata, .cmd = OCC_CMD_SET_POWER_SHIFTING_RATIO, }; int occ_set_psr(u32 handle, int token, u32 ratio) { struct occ_dynamic_data *ddata; struct proc_chip *chip; u8 i = psr_get_rid(handle); if (psr_get_type(handle) != PSR_TYPE_CPU_TO_GPU) return OPAL_UNSUPPORTED; if (ratio > 100) return OPAL_PARAMETER; if (i > nr_occs) return OPAL_UNSUPPORTED; if (!(*chips[i].valid)) return OPAL_HARDWARE; chip = get_chip(chips[i].chip_id); ddata = get_occ_dynamic_data(chip); if (ratio == ddata->pwr_shifting_ratio) return OPAL_SUCCESS; psr_cdata = ratio; return opal_occ_command(&chips[i], token, &psr_data); } static void occ_add_psr_sensors(struct dt_node *power_mgt) { struct dt_node *node; int i; node = dt_new(power_mgt, "psr"); if (!node) { prerror("OCC: Failed to create power-shifting-ratio node\n"); return; } dt_add_property_string(node, "compatible", "ibm,opal-power-shift-ratio"); dt_add_property_cells(node, "#address-cells", 1); dt_add_property_cells(node, "#size-cells", 0); for (i = 0; i < nr_occs; i++) { struct dt_node *cnode; char name[20]; u32 handle = psr_make_handle(PSR_CLASS_OCC, i, PSR_TYPE_CPU_TO_GPU); cnode = dt_new_addr(node, "cpu-to-gpu", handle); if (!cnode) { prerror("OCC: Failed to create power-shifting-ratio node\n"); return; } snprintf(name, 
20, "cpu_to_gpu_%d", chips[i].chip_id); dt_add_property_string(cnode, "label", name); dt_add_property_cells(cnode, "handle", handle); dt_add_property_cells(cnode, "reg", chips[i].chip_id); } } /* OCC clear sensor limits CSM/Profiler/Job-scheduler */ enum occ_sensor_limit_group { OCC_SENSOR_LIMIT_GROUP_CSM = 0x10, OCC_SENSOR_LIMIT_GROUP_PROFILER = 0x20, OCC_SENSOR_LIMIT_GROUP_JOB_SCHED = 0x40, }; static u32 sensor_limit; static struct opal_occ_cmd_data slimit_data = { .data = (u8 *)&sensor_limit, .cmd = OCC_CMD_CLEAR_SENSOR_DATA, }; int occ_sensor_group_clear(u32 group_hndl, int token) { u32 limit = sensor_get_rid(group_hndl); u8 i = sensor_get_attr(group_hndl); if (i > nr_occs) return OPAL_UNSUPPORTED; switch (limit) { case OCC_SENSOR_LIMIT_GROUP_CSM: case OCC_SENSOR_LIMIT_GROUP_PROFILER: case OCC_SENSOR_LIMIT_GROUP_JOB_SCHED: break; default: return OPAL_UNSUPPORTED; } if (!(*chips[i].valid)) return OPAL_HARDWARE; sensor_limit = limit << 24; return opal_occ_command(&chips[i], token, &slimit_data); } static u16 sensor_enable; static struct opal_occ_cmd_data sensor_mask_data = { .data = (u8 *)&sensor_enable, .cmd = OCC_CMD_SELECT_SENSOR_GROUP, }; int occ_sensor_group_enable(u32 group_hndl, int token, bool enable) { u16 type = sensor_get_rid(group_hndl); u8 i = sensor_get_attr(group_hndl); if (i > nr_occs) return OPAL_UNSUPPORTED; switch (type) { case OCC_SENSOR_TYPE_GENERIC: case OCC_SENSOR_TYPE_CURRENT: case OCC_SENSOR_TYPE_VOLTAGE: case OCC_SENSOR_TYPE_TEMPERATURE: case OCC_SENSOR_TYPE_UTILIZATION: case OCC_SENSOR_TYPE_TIME: case OCC_SENSOR_TYPE_FREQUENCY: case OCC_SENSOR_TYPE_POWER: case OCC_SENSOR_TYPE_PERFORMANCE: break; default: return OPAL_UNSUPPORTED; } if (!(*chips[i].valid)) return OPAL_HARDWARE; if (enable && (type & chips[i].enabled_sensor_mask)) return OPAL_SUCCESS; else if (!enable && !(type & chips[i].enabled_sensor_mask)) return OPAL_SUCCESS; sensor_enable = enable ? 
type | chips[i].enabled_sensor_mask :
			~type & chips[i].enabled_sensor_mask;

	return opal_occ_command(&chips[i], token, &sensor_mask_data);
}

/*
 * Add one "occ-<group>" node per known sensor group below @sg for @chipid.
 *
 * @phandles/@ptype are parallel arrays of @nr_phandles sensor phandles and
 * their sensor types.  Groups supporting OPAL_SENSOR_GROUP_ENABLE list only
 * the phandles whose type matches the group; the other groups list all
 * phandles.
 */
void occ_add_sensor_groups(struct dt_node *sg, __be32 *phandles, u32 *ptype,
			   int nr_phandles, int chipid)
{
	struct group_info {
		int type;
		const char *str;
		u32 ops;
	} groups[] = {
		{ OCC_SENSOR_LIMIT_GROUP_CSM, "csm",
		  OPAL_SENSOR_GROUP_CLEAR
		},
		{ OCC_SENSOR_LIMIT_GROUP_PROFILER, "profiler",
		  OPAL_SENSOR_GROUP_CLEAR
		},
		{ OCC_SENSOR_LIMIT_GROUP_JOB_SCHED, "js",
		  OPAL_SENSOR_GROUP_CLEAR
		},
		{ OCC_SENSOR_TYPE_GENERIC, "generic",
		  OPAL_SENSOR_GROUP_ENABLE
		},
		{ OCC_SENSOR_TYPE_CURRENT, "curr",
		  OPAL_SENSOR_GROUP_ENABLE
		},
		{ OCC_SENSOR_TYPE_VOLTAGE, "in",
		  OPAL_SENSOR_GROUP_ENABLE
		},
		{ OCC_SENSOR_TYPE_TEMPERATURE, "temp",
		  OPAL_SENSOR_GROUP_ENABLE
		},
		{ OCC_SENSOR_TYPE_UTILIZATION, "utilization",
		  OPAL_SENSOR_GROUP_ENABLE
		},
		{ OCC_SENSOR_TYPE_TIME, "time",
		  OPAL_SENSOR_GROUP_ENABLE
		},
		{ OCC_SENSOR_TYPE_FREQUENCY, "frequency",
		  OPAL_SENSOR_GROUP_ENABLE
		},
		{ OCC_SENSOR_TYPE_POWER, "power",
		  OPAL_SENSOR_GROUP_ENABLE
		},
		{ OCC_SENSOR_TYPE_PERFORMANCE, "performance",
		  OPAL_SENSOR_GROUP_ENABLE
		},
	};
	int i, j;

	/*
	 * Don't add sensor groups if the cmd-interface is not initialized
	 */
	if (!chips)
		return;

	/*
	 * Find the chips[] index of this chip; it is encoded in the handles.
	 * NOTE(review): i equals nr_occs when @chipid is not in chips[];
	 * confirm callers only pass known chip ids.
	 */
	for (i = 0; i < nr_occs; i++)
		if (chips[i].chip_id == chipid)
			break;

	for (j = 0; j < ARRAY_SIZE(groups); j++) {
		struct dt_node *node;
		char name[20];
		u32 handle;

		snprintf(name, 20, "occ-%s", groups[j].str);
		handle = sensor_make_handler(SENSOR_OCC, 0,
					     groups[j].type, i);
		node = dt_new_addr(sg, name, handle);
		if (!node) {
			prerror("Failed to create sensor group nodes\n");
			return;
		}

		dt_add_property_cells(node, "sensor-group-id", handle);
		dt_add_property_string(node, "type", groups[j].str);

		/* Only these four types also get the opal-sensor properties */
		if (groups[j].type == OCC_SENSOR_TYPE_CURRENT ||
		    groups[j].type == OCC_SENSOR_TYPE_VOLTAGE ||
		    groups[j].type == OCC_SENSOR_TYPE_TEMPERATURE ||
		    groups[j].type == OCC_SENSOR_TYPE_POWER) {
			dt_add_property_string(node, "sensor-type",
					       groups[j].str);
			dt_add_property_string(node,
					       "compatible", "ibm,opal-sensor");
		}

		dt_add_property_cells(node, "ibm,chip-id", chipid);
		dt_add_property_cells(node, "reg", handle);
		if (groups[j].ops == OPAL_SENSOR_GROUP_ENABLE) {
			__be32 *_phandles;
			int k, pcount = 0;

			/* Scratch copy holding only phandles of this type */
			_phandles = malloc(sizeof(u32) * nr_phandles);
			assert(_phandles);
			for (k = 0; k < nr_phandles; k++)
				if (ptype[k] == groups[j].type)
					_phandles[pcount++] = phandles[k];
			if (pcount)
				dt_add_property(node, "sensors", _phandles,
						pcount * sizeof(u32));
			free(_phandles);
		} else {
			dt_add_property(node, "sensors", phandles,
					nr_phandles * sizeof(u32));
		}
		dt_add_property_cells(node, "ops", groups[j].ops);
	}
}

/* CPU-OCC PState init */
/* Called after OCC init on P8 and P9 */
/*
 * On a repeated call (occ_pstates_initialized already set) the pstate
 * properties and "occ*" children of /ibm,opal/power-mgt are removed and
 * rebuilt, but the poller and command interface are not set up again.
 */
void occ_pstates_init(void)
{
	struct proc_chip *chip;
	struct cpu_thread *c;
	struct dt_node *power_mgt;
	int pstate_nom;
	u32 freq_domain_mask;
	u8 domain_runs_at;
	static bool occ_pstates_initialized;

	power_mgt = dt_find_by_path(dt_root, "/ibm,opal/power-mgt");
	if (!power_mgt) {
		/**
		 * @fwts-label OCCDTNodeNotFound
		 * @fwts-advice Device tree node /ibm,opal/power-mgt not
		 * found. OPAL didn't add pstate information to device tree.
		 * Probably a firmware bug.
*/
		prlog(PR_ERR, "OCC: dt node /ibm,opal/power-mgt not found\n");
		return;
	}

	/* Handle fast reboots */
	if (occ_pstates_initialized) {
		struct dt_node *child;
		int i;
		/* Properties of the previous boot that get re-created below */
		const char *props[] = {
				"ibm,pstate-core-max",
				"ibm,pstate-frequencies-mhz",
				"ibm,pstate-ids",
				"ibm,pstate-max",
				"ibm,pstate-min",
				"ibm,pstate-nominal",
				"ibm,pstate-turbo",
				"ibm,pstate-ultra-turbo",
				"ibm,pstate-base",
				"#address-cells",
				"#size-cells",
				};

		for (i = 0; i < ARRAY_SIZE(props); i++)
			dt_check_del_prop(power_mgt, props[i]);

		/*
		 * Drop every child whose name starts with "occ".
		 * NOTE(review): the child is freed while iterating; confirm
		 * dt_for_each_child() tolerates removal of the current node.
		 */
		dt_for_each_child(power_mgt, child)
			if (!strncmp(child->name, "occ", 3))
				dt_free(child);
	}

	/* Select the HOMER offset of the OPAL data; P10 uses the P9 offset */
	switch (proc_gen) {
	case proc_gen_p8:
		homer_opal_data_offset = P8_HOMER_OPAL_DATA_OFFSET;
		break;
	case proc_gen_p9:
	case proc_gen_p10:
		homer_opal_data_offset = P9_HOMER_OPAL_DATA_OFFSET;
		break;
	default:
		return;
	}

	chip = next_chip(NULL);
	if (!chip->homer_base) {
		log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
			"OCC: No HOMER detected, assuming no pstates\n");
		return;
	}

	/* Wait for all OCC to boot up */
	/*
	 * NOTE(review): the two literals below are concatenated without a
	 * separating space ("...complete(timed out)").
	 */
	if(!wait_for_all_occ_init()) {
		log_simple_error(&e_info(OPAL_RC_OCC_TIMEOUT),
			"OCC: Initialization on all chips did not complete"
			"(timed out)\n");
		return;
	}

	/*
	 * Check boundary conditions and add device tree nodes
	 * and return nominal pstate to set for the core
	 */
	if (!add_cpu_pstate_properties(power_mgt, &pstate_nom)) {
		log_simple_error(&e_info(OPAL_RC_OCC_PSTATE_INIT),
			"Skiping core cpufreq init due to OCC error\n");
	} else if (proc_gen == proc_gen_p8) {
		/*
		 * Setup host based pstates and set nominal frequency only in
		 * P8.
*/
		for_each_chip(chip)
			for_each_available_core_in_chip(c, chip->id)
				cpu_pstates_prepare_core(chip, c, pstate_nom);
	}

	/* Everything below is one-time setup; skip it on repeated calls */
	if (occ_pstates_initialized)
		return;

	/* Add opal_poller to poll OCC throttle status of each chip */
	for_each_chip(chip)
		chip->throttle = 0;
	opal_add_poller(occ_throttle_poll, NULL);
	occ_pstates_initialized = true;

	/* Init OPAL-OCC command-response interface */
	occ_cmd_interface_init();

	/* TODO Firmware plumbing required so as to have two modes to set
	 * PMCR based on max in domain or most recently used. As of today,
	 * it is always max in domain for P9.
	 */
	domain_runs_at = 0;
	freq_domain_mask = 0;
	/* Pick the PIR mask and frequency policy of the processor generation */
	if (proc_gen == proc_gen_p8) {
		freq_domain_mask = P8_PIR_CORE_MASK;
		domain_runs_at = FREQ_MOST_RECENTLY_SET;
	} else if (proc_gen == proc_gen_p9) {
		freq_domain_mask = P9_PIR_QUAD_MASK;
		domain_runs_at = FREQ_MAX_IN_DOMAIN;
	} else if (proc_gen == proc_gen_p10) {
		freq_domain_mask = P10_PIR_CHIP_MASK;
		domain_runs_at = FREQ_MAX_IN_DOMAIN;
	} else {
		/* Other generations already returned from the switch above */
		assert(0);
	}

	dt_add_property_cells(power_mgt, "freq-domain-mask", freq_domain_mask);
	dt_add_property_cells(power_mgt, "domain-runs-at", domain_runs_at);
}

/*
 * Split the chip ids into master and slave OCC lists.
 *
 * The first chip (or the first two chips on Murano) is reported as master,
 * all remaining chips as slaves.  The arrays returned through @master and
 * @slave are allocated with malloc().  Returns 0 on success and -ENOMEM on
 * allocation failure.
 *
 * NOTE(review): chipids[] holds MAX_CHIPS entries and the loop does not
 * bound nr_chips; presumably the chip count can never exceed MAX_CHIPS.
 */
int find_master_and_slave_occ(uint64_t **master, uint64_t **slave,
			      int *nr_masters, int *nr_slaves)
{
	struct proc_chip *chip;
	int nr_chips = 0, i;
	uint64_t chipids[MAX_CHIPS];

	for_each_chip(chip) {
		chipids[nr_chips++] = chip->id;
	}

	chip = next_chip(NULL);
	/*
	 * Proc0 is the master OCC for Tuleta/Alpine boxes.
	 * Hostboot expects the pair of chips for MURANO, so pass the sibling
	 * chip id along with proc0 to hostboot.
	 */
	*nr_masters = (chip->type == PROC_CHIP_P8_MURANO) ?
2 : 1; *master = (uint64_t *)malloc(*nr_masters * sizeof(uint64_t)); if (!*master) { printf("OCC: master array alloc failure\n"); return -ENOMEM; } if (nr_chips - *nr_masters > 0) { *nr_slaves = nr_chips - *nr_masters; *slave = (uint64_t *)malloc(*nr_slaves * sizeof(uint64_t)); if (!*slave) { printf("OCC: slave array alloc failure\n"); return -ENOMEM; } } for (i = 0; i < nr_chips; i++) { if (i < *nr_masters) { *(*master + i) = chipids[i]; continue; } *(*slave + i - *nr_masters) = chipids[i]; } return 0; } int occ_msg_queue_occ_reset(void) { struct opal_occ_msg occ_msg = { CPU_TO_BE64(OCC_RESET), 0, 0 }; struct proc_chip *chip; int rc; lock(&occ_lock); rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL, sizeof(struct opal_occ_msg), &occ_msg); if (rc) { prlog(PR_INFO, "OCC: Failed to queue OCC_RESET message\n"); goto out; } /* * Set 'valid' byte of occ_pstate_table to 0 since OCC * may not clear this byte on a reset. * OCC will set the 'valid' byte to 1 when it becomes * active again. */ for_each_chip(chip) { struct occ_pstate_table *occ_data; occ_data = get_occ_pstate_table(chip); occ_data->valid = 0; chip->throttle = 0; } occ_reset = true; out: unlock(&occ_lock); return rc; } #define PV_OCC_GP0 0x01000000 #define PV_OCC_GP0_AND 0x01000004 #define PV_OCC_GP0_OR 0x01000005 #define PV_OCC_GP0_PNOR_OWNER PPC_BIT(18) /* 1 = OCC / Host, 0 = BMC */ static void occ_pnor_set_one_owner(uint32_t chip_id, enum pnor_owner owner) { uint64_t reg, mask; if (owner == PNOR_OWNER_HOST) { reg = PV_OCC_GP0_OR; mask = PV_OCC_GP0_PNOR_OWNER; } else { reg = PV_OCC_GP0_AND; mask = ~PV_OCC_GP0_PNOR_OWNER; } xscom_write(chip_id, reg, mask); } void occ_pnor_set_owner(enum pnor_owner owner) { struct proc_chip *chip; for_each_chip(chip) occ_pnor_set_one_owner(chip->id, owner); } #define P8_OCB_OCI_OCCMISC 0x6a020 #define P8_OCB_OCI_OCCMISC_AND 0x6a021 #define P8_OCB_OCI_OCCMISC_OR 0x6a022 #define P9_OCB_OCI_OCCMISC 0x6c080 #define P9_OCB_OCI_OCCMISC_CLEAR 0x6c081 #define P9_OCB_OCI_OCCMISC_OR 
0x6c082

/* Interrupt source bits of the OCCMISC register */
#define OCB_OCI_OCIMISC_IRQ		PPC_BIT(0)
#define OCB_OCI_OCIMISC_IRQ_TMGT	PPC_BIT(1)
#define OCB_OCI_OCIMISC_IRQ_SLW_TMR	PPC_BIT(14)
#define OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY	PPC_BIT(15)

/* Sources that re-trigger the interrupt on P8 */
#define P8_OCB_OCI_OCIMISC_MASK		(OCB_OCI_OCIMISC_IRQ_TMGT | \
					 OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY | \
					 OCB_OCI_OCIMISC_IRQ_SLW_TMR)

#define OCB_OCI_OCIMISC_IRQ_I2C		PPC_BIT(2)
#define OCB_OCI_OCIMISC_IRQ_SHMEM	PPC_BIT(3)

/* Sources that re-trigger the interrupt on P9 */
#define P9_OCB_OCI_OCIMISC_MASK		(OCB_OCI_OCIMISC_IRQ_TMGT | \
					 OCB_OCI_OCIMISC_IRQ_I2C | \
					 OCB_OCI_OCIMISC_IRQ_SHMEM | \
					 OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY)

/*
 * Raise the OCC "OPAL dummy" interrupt so that OPAL receives a self
 * interrupt.  Skipped on platforms with QUIRK_NO_OCC_IRQ and when no
 * functional PSI host bridge can be found.
 */
void occ_send_dummy_interrupt(void)
{
	struct psi *psi;
	struct proc_chip *chip = get_chip(this_cpu()->chip_id);

	/* Emulators don't do this */
	if (chip_quirk(QUIRK_NO_OCC_IRQ))
		return;

	/* Find a functional PSI. This ensures an interrupt even if
	 * the psihb on the current chip is not configured */
	if (chip->psi)
		psi = chip->psi;
	else
		psi = psi_find_functional_chip();

	if (!psi) {
		prlog_once(PR_WARNING, "PSI: no functional PSI HB found, "
				       "no self interrupts delivered\n");
		return;
	}

	/* P10 uses the same OCCMISC OR register address as P9 */
	switch (proc_gen) {
	case proc_gen_p8:
		xscom_write(psi->chip_id, P8_OCB_OCI_OCCMISC_OR,
			    OCB_OCI_OCIMISC_IRQ |
			    OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY);
		break;
	case proc_gen_p9:
		xscom_write(psi->chip_id, P9_OCB_OCI_OCCMISC_OR,
			    OCB_OCI_OCIMISC_IRQ |
			    OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY);
		break;
	case proc_gen_p10:
		xscom_write(psi->chip_id, P9_OCB_OCI_OCCMISC_OR,
			    OCB_OCI_OCIMISC_IRQ |
			    OCB_OCI_OCIMISC_IRQ_OPAL_DUMMY);
		break;
	default:
		break;
	}
}

/*
 * P8 OCC interrupt handler: read the pending sources from OCCMISC, clear
 * them, dispatch TMGT and SLW timer events, and re-raise the interrupt if
 * new sources became pending in the meantime.
 */
void occ_p8_interrupt(uint32_t chip_id)
{
	uint64_t ireg;
	int64_t rc;

	/* The OCC interrupt is used to mux up to 15 different sources */
	rc = xscom_read(chip_id, P8_OCB_OCI_OCCMISC, &ireg);
	if (rc) {
		prerror("OCC: Failed to read interrupt status !\n");
		/* Should we mask it in the XIVR ?
*/
		return;
	}
	prlog(PR_TRACE, "OCC: IRQ received: %04llx\n", ireg >> 48);

	/* Clear the bits */
	xscom_write(chip_id, P8_OCB_OCI_OCCMISC_AND, ~ireg);

	/* Dispatch */
	if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
		prd_tmgt_interrupt(chip_id);
	if (ireg & OCB_OCI_OCIMISC_IRQ_SLW_TMR)
		check_timers(true);

	/* We may have masked-out OCB_OCI_OCIMISC_IRQ in the previous
	 * OCCMISC_AND write. Check if there are any new source bits set,
	 * and trigger another interrupt if so.
	 */
	rc = xscom_read(chip_id, P8_OCB_OCI_OCCMISC, &ireg);
	if (!rc && (ireg & P8_OCB_OCI_OCIMISC_MASK))
		xscom_write(chip_id, P8_OCB_OCI_OCCMISC_OR,
			    OCB_OCI_OCIMISC_IRQ);
}

/*
 * P9 OCC interrupt handler: read the pending sources from OCCMISC, clear
 * them, dispatch TMGT, shared-memory (throttle poll and command response)
 * and I2C ownership events, and re-raise the interrupt if new sources
 * became pending in the meantime.
 */
void occ_p9_interrupt(uint32_t chip_id)
{
	u64 ireg;
	s64 rc;

	/* The OCC interrupt is used to mux up to 15 different sources */
	rc = xscom_read(chip_id, P9_OCB_OCI_OCCMISC, &ireg);
	if (rc) {
		prerror("OCC: Failed to read interrupt status !\n");
		return;
	}
	prlog(PR_TRACE, "OCC: IRQ received: %04llx\n", ireg >> 48);

	/* Clear the bits (the pending bits themselves are written here) */
	xscom_write(chip_id, P9_OCB_OCI_OCCMISC_CLEAR, ireg);

	/* Dispatch */
	if (ireg & OCB_OCI_OCIMISC_IRQ_TMGT)
		prd_tmgt_interrupt(chip_id);

	if (ireg & OCB_OCI_OCIMISC_IRQ_SHMEM) {
		occ_throttle_poll(NULL);
		handle_occ_rsp(chip_id);
	}

	if (ireg & OCB_OCI_OCIMISC_IRQ_I2C)
		p9_i2c_bus_owner_change(chip_id);

	/* We may have masked-out OCB_OCI_OCIMISC_IRQ in the previous
	 * OCCMISC_CLEAR write. Check if there are any new source bits set,
	 * and trigger another interrupt if so.
	 */
	rc = xscom_read(chip_id, P9_OCB_OCI_OCCMISC, &ireg);
	if (!rc && (ireg & P9_OCB_OCI_OCIMISC_MASK))
		xscom_write(chip_id, P9_OCB_OCI_OCCMISC_OR,
			    OCB_OCI_OCIMISC_IRQ);
}