From 048706d971c1830d7813052ca027ae00c519e894 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:10 +0000
Subject: pseries: Fix and cleanup CPU initialization and reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current pseries machine init function iterates over the CPUs at several
points, doing various bits of initialization.  This is messy; these can
and should be merged into a single iteration doing all the necessary per
cpu initialization.  Worse, some of these initializations were setting up
state which should be set on every reset, not just at machine init time.
A few of the initializations simply weren't necessary at all.

This patch, therefore, moves those things that need to be to the
per-cpu reset handler, and combines the remainder into two loops over
the cpus (which also creates them).  The second loop is for setting up
hash table information, and will be removed in a subsequent patch also
making other fixes to the hash table setup.

This exposes a bug in our start-cpu RTAS routine (called by the guest to
start up CPUs other than CPU0) under kvm.  Previously, this function did
not make a call to ensure that it's changes to the new cpu's state were
pushed into KVM in-kernel state.  We sort-of got away with this because
some of the initializations had already placed the secondary CPUs into the
right starting state for the sorts of Linux guests we've been running.

Nonetheless the start-cpu RTAS call's behaviour was not correct and could
easily have been broken by guest changes.  This patch also fixes it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr.c      | 34 ++++++++++++++++++++--------------
 hw/spapr_rtas.c |  5 +++++
 2 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'hw')

diff --git a/hw/spapr.c b/hw/spapr.c
index c34b767..d88525a 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -581,8 +581,16 @@ static void spapr_reset(void *opaque)
 static void spapr_cpu_reset(void *opaque)
 {
     PowerPCCPU *cpu = opaque;
+    CPUPPCState *env = &cpu->env;
 
     cpu_reset(CPU(cpu));
+
+    /* All CPUs start halted.  CPU0 is unhalted from the machine level
+     * reset code and the rest are explicitly started up by the guest
+     * using an RTAS call */
+    env->halted = 1;
+
+    env->spr[SPR_HIOR] = 0;
 }
 
 /* Returns whether we want to use VGA or not */
@@ -665,11 +673,16 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 
         /* Set time-base frequency to 512 MHz */
         cpu_ppc_tb_init(env, TIMEBASE_FREQ);
-        qemu_register_reset(spapr_cpu_reset, cpu);
 
-        env->hreset_vector = 0x60;
+        /* PAPR always has exception vectors in RAM not ROM */
         env->hreset_excp_prefix = 0;
-        env->gpr[3] = env->cpu_index;
+
+        /* Tell KVM that we're in PAPR mode */
+        if (kvm_enabled()) {
+            kvmppc_set_papr(env);
+        }
+
+        qemu_register_reset(spapr_cpu_reset, cpu);
     }
 
     /* allocate RAM */
@@ -685,7 +698,10 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 
     /* allocate hash page table.  For now we always make this 16mb,
      * later we should probably make it scale to the size of guest
-     * RAM */
+     * RAM.  FIXME: setting the htab information in the CPU env really
+     * belongs at CPU reset time, but we can get away with it for now
+     * because the PAPR guest is not permitted to write SDR1 so in
+     * fact these settings will never change during the run */
     spapr->htab_size = 1ULL << (pteg_shift + 7);
     spapr->htab = qemu_memalign(spapr->htab_size, spapr->htab_size);
 
@@ -697,11 +713,6 @@ static void ppc_spapr_init(ram_addr_t ram_size,
         /* Tell KVM that we're in PAPR mode */
         env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
                              ((pteg_shift + 7) - 18);
-        env->spr[SPR_HIOR] = 0;
-
-        if (kvm_enabled()) {
-            kvmppc_set_papr(env);
-        }
     }
 
     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
@@ -827,11 +838,6 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 
     spapr->entry_point = 0x100;
 
-    /* SLOF will startup the secondary CPUs using RTAS */
-    for (env = first_cpu; env != NULL; env = env->next_cpu) {
-        env->halted = 1;
-    }
-
     /* Prepare the device tree */
     spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size,
                                             initrd_base, initrd_size,
diff --git a/hw/spapr_rtas.c b/hw/spapr_rtas.c
index ae18595..b808f80 100644
--- a/hw/spapr_rtas.c
+++ b/hw/spapr_rtas.c
@@ -184,6 +184,11 @@ static void rtas_start_cpu(sPAPREnvironment *spapr,
             return;
         }
 
+        /* This will make sure qemu state is up to date with kvm, and
+         * mark it dirty so our changes get flushed back before the
+         * new cpu enters */
+        kvm_cpu_synchronize_state(env);
+
         env->msr = (1ULL << MSR_SF) | (1ULL << MSR_ME);
         env->nip = start;
         env->gpr[3] = r3;
-- 
cgit v1.1


From c8787ad477f3be5a971b877dcb1bae5752c5796a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:11 +0000
Subject: pseries: Use new method to correct reset sequence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A number of things need to occur during reset of the PAPR
paravirtualized platform in a specific order.  For example, the hash
table needs to be cleared before the CPUs are reset, so that they
initialize their register state correctly, and the CPUs need to have
their main reset called before we set up the entry point state on the
boot cpu.  We also need to have the main qdev reset happen before the
creation and installation of the device tree for the new boot, because
we need the state of the devices settled to correctly construct the
device tree.

We currently do the pseries once-per-reset initializations done from a
reset handler.  However we can't adequately control when this handler
is called during the reset - in particular we can't guarantee it
happens after all the qdev resets (since qdevs might be registered
after the machine init function has executed).

This patch uses the new QEMUMachine reset method to to fix this
problem, ensuring the various order dependent reset steps happen in
the correct order.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'hw')

diff --git a/hw/spapr.c b/hw/spapr.c
index d88525a..68542e8 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -559,13 +559,13 @@ static void emulate_spapr_hypercall(CPUPPCState *env)
     env->gpr[3] = spapr_hypercall(env, env->gpr[3], &env->gpr[4]);
 }
 
-static void spapr_reset(void *opaque)
+static void ppc_spapr_reset(void)
 {
-    sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
-
     /* flush out the hash table */
     memset(spapr->htab, 0, spapr->htab_size);
 
+    qemu_devices_reset();
+
     /* Load the fdt */
     spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
                        spapr->rtas_size);
@@ -845,14 +845,13 @@ static void ppc_spapr_init(ram_addr_t ram_size,
                                             boot_device, kernel_cmdline,
                                             pteg_shift + 7);
     assert(spapr->fdt_skel != NULL);
-
-    qemu_register_reset(spapr_reset, spapr);
 }
 
 static QEMUMachine spapr_machine = {
     .name = "pseries",
     .desc = "pSeries Logical Partition (PAPR compliant)",
     .init = ppc_spapr_init,
+    .reset = ppc_spapr_reset,
     .max_cpus = MAX_CPUS,
     .no_parallel = 1,
     .use_scsi = 1,
-- 
cgit v1.1


From 7f763a5d994bbddb50705d2e50decdf52937521f Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:12 +0000
Subject: pseries: Add support for new KVM hash table control call

This adds support for then new "reset htab" ioctl which allows qemu
to properly cleanup the MMU hash table when the guest is reset. With
the corresponding kernel support, reset of a guest now works properly.

This also paves the way for indicating a different size hash table
to the kernel and for the kernel to be able to impose limits on
the requested size.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr.c | 274 ++++++++++++++++++++++++++++++++++++-------------------------
 hw/spapr.h |   4 +-
 2 files changed, 165 insertions(+), 113 deletions(-)

(limited to 'hw')

diff --git a/hw/spapr.c b/hw/spapr.c
index 68542e8..0a0e9cd 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -85,6 +85,8 @@
 
 #define PHANDLE_XICP            0x00001111
 
+#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
+
 sPAPREnvironment *spapr;
 
 int spapr_allocate_irq(int hint, enum xics_irq_type type)
@@ -134,12 +136,13 @@ int spapr_allocate_irq_block(int num, enum xics_irq_type type)
     return first;
 }
 
-static int spapr_set_associativity(void *fdt, sPAPREnvironment *spapr)
+static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
 {
     int ret = 0, offset;
     CPUPPCState *env;
     char cpu_model[32];
     int smt = kvmppc_smt_threads();
+    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
 
     assert(spapr->cpu_model);
 
@@ -163,8 +166,16 @@ static int spapr_set_associativity(void *fdt, sPAPREnvironment *spapr)
             return offset;
         }
 
-        ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
-                          sizeof(associativity));
+        if (nb_numa_nodes > 1) {
+            ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
+                              sizeof(associativity));
+            if (ret < 0) {
+                return ret;
+            }
+        }
+
+        ret = fdt_setprop(fdt, offset, "ibm,pft-size",
+                          pft_size_prop, sizeof(pft_size_prop));
         if (ret < 0) {
             return ret;
         }
@@ -206,45 +217,36 @@ static size_t create_page_sizes_prop(CPUPPCState *env, uint32_t *prop,
     return (p - prop) * sizeof(uint32_t);
 }
 
+#define _FDT(exp) \
+    do { \
+        int ret = (exp);                                           \
+        if (ret < 0) {                                             \
+            fprintf(stderr, "qemu: error creating device tree: %s: %s\n", \
+                    #exp, fdt_strerror(ret));                      \
+            exit(1);                                               \
+        }                                                          \
+    } while (0)
+
+
 static void *spapr_create_fdt_skel(const char *cpu_model,
-                                   target_phys_addr_t rma_size,
                                    target_phys_addr_t initrd_base,
                                    target_phys_addr_t initrd_size,
                                    target_phys_addr_t kernel_size,
                                    const char *boot_device,
-                                   const char *kernel_cmdline,
-                                   long hash_shift)
+                                   const char *kernel_cmdline)
 {
     void *fdt;
     CPUPPCState *env;
-    uint64_t mem_reg_property[2];
     uint32_t start_prop = cpu_to_be32(initrd_base);
     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
-    uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
     char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
         "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk";
     char qemu_hypertas_prop[] = "hcall-memop1";
+    uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
     uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)};
-    int i;
     char *modelname;
-    int smt = kvmppc_smt_threads();
+    int i, smt = kvmppc_smt_threads();
     unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80};
-    uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)};
-    uint32_t associativity[] = {cpu_to_be32(0x4), cpu_to_be32(0x0),
-                                cpu_to_be32(0x0), cpu_to_be32(0x0),
-                                cpu_to_be32(0x0)};
-    char mem_name[32];
-    target_phys_addr_t node0_size, mem_start;
-
-#define _FDT(exp) \
-    do { \
-        int ret = (exp);                                           \
-        if (ret < 0) {                                             \
-            fprintf(stderr, "qemu: error creating device tree: %s: %s\n", \
-                    #exp, fdt_strerror(ret));                      \
-            exit(1);                                               \
-        }                                                          \
-    } while (0)
 
     fdt = g_malloc0(FDT_MAX_SIZE);
     _FDT((fdt_create(fdt, FDT_MAX_SIZE)));
@@ -289,55 +291,6 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
 
     _FDT((fdt_end_node(fdt)));
 
-    /* memory node(s) */
-    node0_size = (nb_numa_nodes > 1) ? node_mem[0] : ram_size;
-    if (rma_size > node0_size) {
-        rma_size = node0_size;
-    }
-
-    /* RMA */
-    mem_reg_property[0] = 0;
-    mem_reg_property[1] = cpu_to_be64(rma_size);
-    _FDT((fdt_begin_node(fdt, "memory@0")));
-    _FDT((fdt_property_string(fdt, "device_type", "memory")));
-    _FDT((fdt_property(fdt, "reg", mem_reg_property,
-        sizeof(mem_reg_property))));
-    _FDT((fdt_property(fdt, "ibm,associativity", associativity,
-        sizeof(associativity))));
-    _FDT((fdt_end_node(fdt)));
-
-    /* RAM: Node 0 */
-    if (node0_size > rma_size) {
-        mem_reg_property[0] = cpu_to_be64(rma_size);
-        mem_reg_property[1] = cpu_to_be64(node0_size - rma_size);
-
-        sprintf(mem_name, "memory@" TARGET_FMT_lx, rma_size);
-        _FDT((fdt_begin_node(fdt, mem_name)));
-        _FDT((fdt_property_string(fdt, "device_type", "memory")));
-        _FDT((fdt_property(fdt, "reg", mem_reg_property,
-                           sizeof(mem_reg_property))));
-        _FDT((fdt_property(fdt, "ibm,associativity", associativity,
-                           sizeof(associativity))));
-        _FDT((fdt_end_node(fdt)));
-    }
-
-    /* RAM: Node 1 and beyond */
-    mem_start = node0_size;
-    for (i = 1; i < nb_numa_nodes; i++) {
-        mem_reg_property[0] = cpu_to_be64(mem_start);
-        mem_reg_property[1] = cpu_to_be64(node_mem[i]);
-        associativity[3] = associativity[4] = cpu_to_be32(i);
-        sprintf(mem_name, "memory@" TARGET_FMT_lx, mem_start);
-        _FDT((fdt_begin_node(fdt, mem_name)));
-        _FDT((fdt_property_string(fdt, "device_type", "memory")));
-        _FDT((fdt_property(fdt, "reg", mem_reg_property,
-            sizeof(mem_reg_property))));
-        _FDT((fdt_property(fdt, "ibm,associativity", associativity,
-            sizeof(associativity))));
-        _FDT((fdt_end_node(fdt)));
-        mem_start += node_mem[i];
-    }
-
     /* cpus */
     _FDT((fdt_begin_node(fdt, "cpus")));
 
@@ -389,8 +342,6 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
         _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
         _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
         _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
-        _FDT((fdt_property(fdt, "ibm,pft-size",
-                           pft_size_prop, sizeof(pft_size_prop))));
         _FDT((fdt_property_string(fdt, "status", "okay")));
         _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
 
@@ -489,6 +440,68 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
     return fdt;
 }
 
+static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt)
+{
+    uint32_t associativity[] = {cpu_to_be32(0x4), cpu_to_be32(0x0),
+                                cpu_to_be32(0x0), cpu_to_be32(0x0),
+                                cpu_to_be32(0x0)};
+    char mem_name[32];
+    target_phys_addr_t node0_size, mem_start;
+    uint64_t mem_reg_property[2];
+    int i, off;
+
+    /* memory node(s) */
+    node0_size = (nb_numa_nodes > 1) ? node_mem[0] : ram_size;
+    if (spapr->rma_size > node0_size) {
+        spapr->rma_size = node0_size;
+    }
+
+    /* RMA */
+    mem_reg_property[0] = 0;
+    mem_reg_property[1] = cpu_to_be64(spapr->rma_size);
+    off = fdt_add_subnode(fdt, 0, "memory@0");
+    _FDT(off);
+    _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
+    _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
+                      sizeof(mem_reg_property))));
+    _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
+                      sizeof(associativity))));
+
+    /* RAM: Node 0 */
+    if (node0_size > spapr->rma_size) {
+        mem_reg_property[0] = cpu_to_be64(spapr->rma_size);
+        mem_reg_property[1] = cpu_to_be64(node0_size - spapr->rma_size);
+
+        sprintf(mem_name, "memory@" TARGET_FMT_lx, spapr->rma_size);
+        off = fdt_add_subnode(fdt, 0, mem_name);
+        _FDT(off);
+        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
+        _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
+                          sizeof(mem_reg_property))));
+        _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
+                          sizeof(associativity))));
+    }
+
+    /* RAM: Node 1 and beyond */
+    mem_start = node0_size;
+    for (i = 1; i < nb_numa_nodes; i++) {
+        mem_reg_property[0] = cpu_to_be64(mem_start);
+        mem_reg_property[1] = cpu_to_be64(node_mem[i]);
+        associativity[3] = associativity[4] = cpu_to_be32(i);
+        sprintf(mem_name, "memory@" TARGET_FMT_lx, mem_start);
+        off = fdt_add_subnode(fdt, 0, mem_name);
+        _FDT(off);
+        _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
+        _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
+                          sizeof(mem_reg_property))));
+        _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
+                          sizeof(associativity))));
+        mem_start += node_mem[i];
+    }
+
+    return 0;
+}
+
 static void spapr_finalize_fdt(sPAPREnvironment *spapr,
                                target_phys_addr_t fdt_addr,
                                target_phys_addr_t rtas_addr,
@@ -503,6 +516,12 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr,
     /* open out the base tree into a temp buffer for the final tweaks */
     _FDT((fdt_open_into(spapr->fdt_skel, fdt, FDT_MAX_SIZE)));
 
+    ret = spapr_populate_memory(spapr, fdt);
+    if (ret < 0) {
+        fprintf(stderr, "couldn't setup memory nodes in fdt\n");
+        exit(1);
+    }
+
     ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
     if (ret < 0) {
         fprintf(stderr, "couldn't setup vio devices in fdt\n");
@@ -525,11 +544,9 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr,
     }
 
     /* Advertise NUMA via ibm,associativity */
-    if (nb_numa_nodes > 1) {
-        ret = spapr_set_associativity(fdt, spapr);
-        if (ret < 0) {
-            fprintf(stderr, "Couldn't set up NUMA device tree properties\n");
-        }
+    ret = spapr_fixup_cpu_dt(fdt, spapr);
+    if (ret < 0) {
+        fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
     }
 
     if (!spapr->has_graphics) {
@@ -559,10 +576,39 @@ static void emulate_spapr_hypercall(CPUPPCState *env)
     env->gpr[3] = spapr_hypercall(env, env->gpr[3], &env->gpr[4]);
 }
 
+static void spapr_reset_htab(sPAPREnvironment *spapr)
+{
+    long shift;
+
+    /* allocate hash page table.  For now we always make this 16mb,
+     * later we should probably make it scale to the size of guest
+     * RAM */
+
+    shift = kvmppc_reset_htab(spapr->htab_shift);
+
+    if (shift > 0) {
+        /* Kernel handles htab, we don't need to allocate one */
+        spapr->htab_shift = shift;
+    } else {
+        if (!spapr->htab) {
+            /* Allocate an htab if we don't yet have one */
+            spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
+        }
+
+        /* And clear it */
+        memset(spapr->htab, 0, HTAB_SIZE(spapr));
+    }
+
+    /* Update the RMA size if necessary */
+    if (spapr->vrma_adjust) {
+        spapr->rma_size = kvmppc_rma_size(ram_size, spapr->htab_shift);
+    }
+}
+
 static void ppc_spapr_reset(void)
 {
-    /* flush out the hash table */
-    memset(spapr->htab, 0, spapr->htab_size);
+    /* Reset the hash table & recalc the RMA */
+    spapr_reset_htab(spapr);
 
     qemu_devices_reset();
 
@@ -591,6 +637,12 @@ static void spapr_cpu_reset(void *opaque)
     env->halted = 1;
 
     env->spr[SPR_HIOR] = 0;
+
+    env->external_htab = spapr->htab;
+    env->htab_base = -1;
+    env->htab_mask = HTAB_SIZE(spapr) - 1;
+    env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
+        (spapr->htab_shift - 18);
 }
 
 /* Returns whether we want to use VGA or not */
@@ -624,11 +676,10 @@ static void ppc_spapr_init(ram_addr_t ram_size,
     int i;
     MemoryRegion *sysmem = get_system_memory();
     MemoryRegion *ram = g_new(MemoryRegion, 1);
-    target_phys_addr_t rma_alloc_size, rma_size;
+    target_phys_addr_t rma_alloc_size;
     uint32_t initrd_base = 0;
     long kernel_size = 0, initrd_size = 0;
     long load_limit, rtas_limit, fw_size;
-    long pteg_shift = 17;
     char *filename;
 
     msi_supported = true;
@@ -645,20 +696,39 @@ static void ppc_spapr_init(ram_addr_t ram_size,
         hw_error("qemu: Unable to create RMA\n");
         exit(1);
     }
+
     if (rma_alloc_size && (rma_alloc_size < ram_size)) {
-        rma_size = rma_alloc_size;
+        spapr->rma_size = rma_alloc_size;
     } else {
-        rma_size = ram_size;
+        spapr->rma_size = ram_size;
+
+        /* With KVM, we don't actually know whether KVM supports an
+         * unbounded RMA (PR KVM) or is limited by the hash table size
+         * (HV KVM using VRMA), so we always assume the latter
+         *
+         * In that case, we also limit the initial allocations for RTAS
+         * etc... to 256M since we have no way to know what the VRMA size
+         * is going to be as it depends on the size of the hash table
+         * isn't determined yet.
+         */
+        if (kvm_enabled()) {
+            spapr->vrma_adjust = 1;
+            spapr->rma_size = MIN(spapr->rma_size, 0x10000000);
+        }
     }
 
     /* We place the device tree and RTAS just below either the top of the RMA,
      * or just below 2GB, whichever is lowere, so that it can be
      * processed with 32-bit real mode code if necessary */
-    rtas_limit = MIN(rma_size, 0x80000000);
+    rtas_limit = MIN(spapr->rma_size, 0x80000000);
     spapr->rtas_addr = rtas_limit - RTAS_MAX_SIZE;
     spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
     load_limit = spapr->fdt_addr - FW_OVERHEAD;
 
+    /* For now, always aim for a 16MB hash table */
+    /* FIXME: we should change this default based on RAM size */
+    spapr->htab_shift = 24;
+
     /* init CPUs */
     if (cpu_model == NULL) {
         cpu_model = kvm_enabled() ? "host" : "POWER7";
@@ -696,25 +766,6 @@ static void ppc_spapr_init(ram_addr_t ram_size,
         memory_region_add_subregion(sysmem, nonrma_base, ram);
     }
 
-    /* allocate hash page table.  For now we always make this 16mb,
-     * later we should probably make it scale to the size of guest
-     * RAM.  FIXME: setting the htab information in the CPU env really
-     * belongs at CPU reset time, but we can get away with it for now
-     * because the PAPR guest is not permitted to write SDR1 so in
-     * fact these settings will never change during the run */
-    spapr->htab_size = 1ULL << (pteg_shift + 7);
-    spapr->htab = qemu_memalign(spapr->htab_size, spapr->htab_size);
-
-    for (env = first_cpu; env != NULL; env = env->next_cpu) {
-        env->external_htab = spapr->htab;
-        env->htab_base = -1;
-        env->htab_mask = spapr->htab_size - 1;
-
-        /* Tell KVM that we're in PAPR mode */
-        env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
-                             ((pteg_shift + 7) - 18);
-    }
-
     filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
     spapr->rtas_size = load_image_targphys(filename, spapr->rtas_addr,
                                            rtas_limit - spapr->rtas_addr);
@@ -787,7 +838,7 @@ static void ppc_spapr_init(ram_addr_t ram_size,
         }
     }
 
-    if (rma_size < (MIN_RMA_SLOF << 20)) {
+    if (spapr->rma_size < (MIN_RMA_SLOF << 20)) {
         fprintf(stderr, "qemu: pSeries SLOF firmware requires >= "
                 "%ldM guest RMA (Real Mode Area memory)\n", MIN_RMA_SLOF);
         exit(1);
@@ -839,11 +890,10 @@ static void ppc_spapr_init(ram_addr_t ram_size,
     spapr->entry_point = 0x100;
 
     /* Prepare the device tree */
-    spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size,
+    spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
                                             initrd_base, initrd_size,
                                             kernel_size,
-                                            boot_device, kernel_cmdline,
-                                            pteg_shift + 7);
+                                            boot_device, kernel_cmdline);
     assert(spapr->fdt_skel != NULL);
 }
 
diff --git a/hw/spapr.h b/hw/spapr.h
index ac34a17..f1fb646 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -15,7 +15,9 @@ typedef struct sPAPREnvironment {
 
     target_phys_addr_t ram_limit;
     void *htab;
-    long htab_size;
+    long htab_shift;
+    target_phys_addr_t rma_size;
+    int vrma_adjust;
     target_phys_addr_t fdt_addr, rtas_addr;
     long rtas_size;
     void *fdt_skel;
-- 
cgit v1.1


From 4dd96f244f62d5e4b493c1f4071c0d4a4a57474d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:13 +0000
Subject: pseries: Clear TCE and signal state when resetting PAPR VIO devices

When we reset the system, the reset method for VIO bus devices resets
the state of their request queue (if present) as it should.  However
it was not resetting the state of their TCE table (DMA translation) if
present.  It was also not resetting the state of the per-device signal
mask set with H_VIO_SIGNAL.  This patch corrects both bugs, and also
removes some small code duplication in the reset paths.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr_vio.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'hw')

diff --git a/hw/spapr_vio.c b/hw/spapr_vio.c
index 7ca4452..752836e 100644
--- a/hw/spapr_vio.c
+++ b/hw/spapr_vio.c
@@ -324,9 +324,7 @@ static void spapr_vio_quiesce_one(VIOsPAPRDevice *dev)
     }
     dev->dma = spapr_tce_new_dma_context(liobn, pc->rtce_window_size);
 
-    dev->crq.qladdr = 0;
-    dev->crq.qsize = 0;
-    dev->crq.qnext = 0;
+    free_crq(dev);
 }
 
 static void rtas_set_tce_bypass(sPAPREnvironment *spapr, uint32_t token,
@@ -409,9 +407,10 @@ static void spapr_vio_busdev_reset(DeviceState *qdev)
     VIOsPAPRDevice *dev = DO_UPCAST(VIOsPAPRDevice, qdev, qdev);
     VIOsPAPRDeviceClass *pc = VIO_SPAPR_DEVICE_GET_CLASS(dev);
 
-    if (dev->crq.qsize) {
-        free_crq(dev);
-    }
+    /* Shut down the request queue and TCEs if necessary */
+    spapr_vio_quiesce_one(dev);
+
+    dev->signal_state = 0;
 
     if (pc->reset) {
         pc->reset(dev);
-- 
cgit v1.1


From eddeed26ac83392053aef823a341f643ea8e3d2f Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:14 +0000
Subject: pseries: Reset emulated PCI TCE tables on system reset

The emulated PCI host bridge on the pseries machine incorporates an IOMMU
(PAPR TCE table).  Currently the mappings in this IOMMU are not cleared
when we reset the system.  This patch fixes this bug.  To do this it adds
a new reset function to the IOMMU emulation code.  The VIO devices already
reset their TCE tables, but they do so by destroying and re-creating their
DMA context.  This doesn't work for the PCI host bridge, because the
infrastructure for PCI IOMMUs has already copied/cached the DMA pointer
context into the subordinate PCI device structures.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr.h       |  1 +
 hw/spapr_iommu.c | 11 +++++++++++
 hw/spapr_pci.c   | 10 ++++++++++
 3 files changed, 22 insertions(+)

(limited to 'hw')

diff --git a/hw/spapr.h b/hw/spapr.h
index f1fb646..f9a7b0f 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -338,6 +338,7 @@ typedef struct sPAPRTCE {
 void spapr_iommu_init(void);
 DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size);
 void spapr_tce_free(DMAContext *dma);
+void spapr_tce_reset(DMAContext *dma);
 int spapr_dma_dt(void *fdt, int node_off, const char *propname,
                  uint32_t liobn, uint64_t window, uint32_t size);
 int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
index 53b7317..216aa06 100644
--- a/hw/spapr_iommu.c
+++ b/hw/spapr_iommu.c
@@ -162,6 +162,17 @@ void spapr_tce_free(DMAContext *dma)
     }
 }
 
+void spapr_tce_reset(DMAContext *dma)
+{
+    if (dma) {
+        sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, dma);
+        size_t table_size = (tcet->window_size >> SPAPR_TCE_PAGE_SHIFT)
+            * sizeof(sPAPRTCE);
+
+        memset(tcet->table, 0, table_size);
+    }
+}
+
 static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba,
                                 target_ulong tce)
 {
diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
index 661c05b..203155e 100644
--- a/hw/spapr_pci.c
+++ b/hw/spapr_pci.c
@@ -595,6 +595,15 @@ static int spapr_phb_init(SysBusDevice *s)
     return 0;
 }
 
+static void spapr_phb_reset(DeviceState *qdev)
+{
+    SysBusDevice *s = sysbus_from_qdev(qdev);
+    sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
+
+    /* Reset the IOMMU state */
+    spapr_tce_reset(sphb->dma);
+}
+
 static Property spapr_phb_properties[] = {
     DEFINE_PROP_HEX64("buid", sPAPRPHBState, buid, 0),
     DEFINE_PROP_STRING("busname", sPAPRPHBState, busname),
@@ -613,6 +622,7 @@ static void spapr_phb_class_init(ObjectClass *klass, void *data)
 
     sdc->init = spapr_phb_init;
     dc->props = spapr_phb_properties;
+    dc->reset = spapr_phb_reset;
 }
 
 static const TypeInfo spapr_phb_info = {
-- 
cgit v1.1


From 256b408abea2cfe18d8c0278e5b46213509db271 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:15 +0000
Subject: pseries: Fix XICS reset

The XICS interrupt controller used on the pseries machine currently has no
reset handler.  We can get away with this under some circumstances, but
it's not correct, and can cause failures if the XICS happens to be in the
wrong state at the time of reset.

This patch adds a hook to properly reset the XICS state.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/xics.c | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

(limited to 'hw')

diff --git a/hw/xics.c b/hw/xics.c
index b674771..a8a08ce 100644
--- a/hw/xics.c
+++ b/hw/xics.c
@@ -489,11 +489,36 @@ static void rtas_int_on(sPAPREnvironment *spapr, uint32_t token,
     rtas_st(rets, 0, 0); /* Success */
 }
 
+static void xics_reset(void *opaque)
+{
+    struct icp_state *icp = (struct icp_state *)opaque;
+    struct ics_state *ics = icp->ics;
+    int i;
+
+    for (i = 0; i < icp->nr_servers; i++) {
+        icp->ss[i].xirr = 0;
+        icp->ss[i].pending_priority = 0;
+        icp->ss[i].mfrr = 0xff;
+        /* Make all outputs are deasserted */
+        qemu_set_irq(icp->ss[i].output, 0);
+    }
+
+    for (i = 0; i < ics->nr_irqs; i++) {
+        /* Reset everything *except* the type */
+        ics->irqs[i].server = 0;
+        ics->irqs[i].asserted = 0;
+        ics->irqs[i].sent = 0;
+        ics->irqs[i].rejected = 0;
+        ics->irqs[i].masked_pending = 0;
+        ics->irqs[i].priority = 0xff;
+        ics->irqs[i].saved_priority = 0xff;
+    }
+}
+
 struct icp_state *xics_system_init(int nr_irqs)
 {
     CPUPPCState *env;
     int max_server_num;
-    int i;
     struct icp_state *icp;
     struct ics_state *ics;
 
@@ -508,10 +533,6 @@ struct icp_state *xics_system_init(int nr_irqs)
     icp->nr_servers = max_server_num + 1;
     icp->ss = g_malloc0(icp->nr_servers*sizeof(struct icp_server_state));
 
-    for (i = 0; i < icp->nr_servers; i++) {
-        icp->ss[i].mfrr = 0xff;
-    }
-
     for (env = first_cpu; env != NULL; env = env->next_cpu) {
         struct icp_server_state *ss = &icp->ss[env->cpu_index];
 
@@ -539,11 +560,6 @@ struct icp_state *xics_system_init(int nr_irqs)
     icp->ics = ics;
     ics->icp = icp;
 
-    for (i = 0; i < nr_irqs; i++) {
-        ics->irqs[i].priority = 0xff;
-        ics->irqs[i].saved_priority = 0xff;
-    }
-
     ics->qirqs = qemu_allocate_irqs(ics_set_irq, ics, nr_irqs);
 
     spapr_register_hypercall(H_CPPR, h_cppr);
@@ -556,5 +572,7 @@ struct icp_state *xics_system_init(int nr_irqs)
     spapr_rtas_register("ibm,int-off", rtas_int_off);
     spapr_rtas_register("ibm,int-on", rtas_int_on);
 
+    qemu_register_reset(xics_reset, icp);
+
     return icp;
 }
-- 
cgit v1.1


From 1dd088946cf464a994bc93945a360aef049493af Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:16 +0000
Subject: pseries: Small cleanup to H_CEDE implementation

The H_CEDE hypercall implementation for the pseries machine doesn't trigger
quite the right path in the main cpu exec loop.  We should set exit_request
to pop up one extra level and recheck state, and we should set the
exception_index to EXCP_HLT (H_CEDE is roughly equivalent to the hlt
instruction on x86).

In practice, this doesn't really matter except for KVM, and KVM implements
H_CEDE internally so we never hit this code path.  But we might as well
get it right, just in case it matters some day.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr_hcall.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'hw')

diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
index abd847f..2df94d1 100644
--- a/hw/spapr_hcall.c
+++ b/hw/spapr_hcall.c
@@ -544,6 +544,8 @@ static target_ulong h_cede(CPUPPCState *env, sPAPREnvironment *spapr,
     hreg_compute_hflags(env);
     if (!cpu_has_work(env)) {
         env->halted = 1;
+        env->exception_index = EXCP_HLT;
+        env->exit_request = 1;
     }
     return H_SUCCESS;
 }
-- 
cgit v1.1


From 98ca8c023825fc6dd99e6cea1956d84ed8cadb3a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:17 +0000
Subject: pseries: Remove C bitfields from xics code

The XICS interrupt controller emulation uses some C bitfield variables in
its internal state structure.  This makes like awkward for saving the state
because we don't have easy VMSTATE helpers for bitfields.

This patch removes the bitfields, instead using explicit bit masking in a
single status variable.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/xics.c | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

(limited to 'hw')

diff --git a/hw/xics.c b/hw/xics.c
index a8a08ce..648af25 100644
--- a/hw/xics.c
+++ b/hw/xics.c
@@ -165,11 +165,12 @@ struct ics_irq_state {
     int server;
     uint8_t priority;
     uint8_t saved_priority;
+#define XICS_STATUS_ASSERTED           0x1
+#define XICS_STATUS_SENT               0x2
+#define XICS_STATUS_REJECTED           0x4
+#define XICS_STATUS_MASKED_PENDING     0x8
+    uint8_t status;
     enum xics_irq_type type;
-    int asserted:1;
-    int sent:1;
-    int rejected:1;
-    int masked_pending:1;
 };
 
 struct ics_state {
@@ -191,8 +192,8 @@ static void resend_msi(struct ics_state *ics, int srcno)
     struct ics_irq_state *irq = ics->irqs + srcno;
 
     /* FIXME: filter by server#? */
-    if (irq->rejected) {
-        irq->rejected = 0;
+    if (irq->status & XICS_STATUS_REJECTED) {
+        irq->status &= ~XICS_STATUS_REJECTED;
         if (irq->priority != 0xff) {
             icp_irq(ics->icp, irq->server, srcno + ics->offset,
                     irq->priority);
@@ -204,8 +205,10 @@ static void resend_lsi(struct ics_state *ics, int srcno)
 {
     struct ics_irq_state *irq = ics->irqs + srcno;
 
-    if ((irq->priority != 0xff) && irq->asserted && !irq->sent) {
-        irq->sent = 1;
+    if ((irq->priority != 0xff)
+        && (irq->status & XICS_STATUS_ASSERTED)
+        && !(irq->status & XICS_STATUS_SENT)) {
+        irq->status |= XICS_STATUS_SENT;
         icp_irq(ics->icp, irq->server, srcno + ics->offset, irq->priority);
     }
 }
@@ -216,7 +219,7 @@ static void set_irq_msi(struct ics_state *ics, int srcno, int val)
 
     if (val) {
         if (irq->priority == 0xff) {
-            irq->masked_pending = 1;
+            irq->status |= XICS_STATUS_MASKED_PENDING;
             /* masked pending */ ;
         } else  {
             icp_irq(ics->icp, irq->server, srcno + ics->offset, irq->priority);
@@ -228,7 +231,11 @@ static void set_irq_lsi(struct ics_state *ics, int srcno, int val)
 {
     struct ics_irq_state *irq = ics->irqs + srcno;
 
-    irq->asserted = val;
+    if (val) {
+        irq->status |= XICS_STATUS_ASSERTED;
+    } else {
+        irq->status &= ~XICS_STATUS_ASSERTED;
+    }
     resend_lsi(ics, srcno);
 }
 
@@ -248,11 +255,12 @@ static void write_xive_msi(struct ics_state *ics, int srcno)
 {
     struct ics_irq_state *irq = ics->irqs + srcno;
 
-    if (!irq->masked_pending || (irq->priority == 0xff)) {
+    if (!(irq->status & XICS_STATUS_MASKED_PENDING)
+        || (irq->priority == 0xff)) {
         return;
     }
 
-    irq->masked_pending = 0;
+    irq->status &= ~XICS_STATUS_MASKED_PENDING;
     icp_irq(ics->icp, irq->server, srcno + ics->offset, irq->priority);
 }
 
@@ -281,8 +289,8 @@ static void ics_reject(struct ics_state *ics, int nr)
 {
     struct ics_irq_state *irq = ics->irqs + nr - ics->offset;
 
-    irq->rejected = 1; /* Irrelevant but harmless for LSI */
-    irq->sent = 0; /* Irrelevant but harmless for MSI */
+    irq->status |= XICS_STATUS_REJECTED; /* Irrelevant but harmless for LSI */
+    irq->status &= ~XICS_STATUS_SENT; /* Irrelevant but harmless for MSI */
 }
 
 static void ics_resend(struct ics_state *ics)
@@ -307,7 +315,7 @@ static void ics_eoi(struct ics_state *ics, int nr)
     struct ics_irq_state *irq = ics->irqs + srcno;
 
     if (irq->type == XICS_LSI) {
-        irq->sent = 0;
+        irq->status &= ~XICS_STATUS_SENT;
     }
 }
 
@@ -506,10 +514,7 @@ static void xics_reset(void *opaque)
     for (i = 0; i < ics->nr_irqs; i++) {
         /* Reset everything *except* the type */
         ics->irqs[i].server = 0;
-        ics->irqs[i].asserted = 0;
-        ics->irqs[i].sent = 0;
-        ics->irqs[i].rejected = 0;
-        ics->irqs[i].masked_pending = 0;
+        ics->irqs[i].status = 0;
         ics->irqs[i].priority = 0xff;
         ics->irqs[i].saved_priority = 0xff;
     }
-- 
cgit v1.1


From ff9d2afa618acd81d926c9c213b4ff5f7163db1d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:18 +0000
Subject: pseries: Remove XICS irq type enum type

Currently the XICS interrupt controller emulation uses a custom enum to
specify whether a given interrupt is level-sensitive or message-triggered.
This enum makes life awkward for saving the state, and isn't particularly
useful since there are only two possibilities.  This patch replaces the
enum with a simple bool.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr.c     |  8 ++++----
 hw/spapr.h     |  8 ++++----
 hw/spapr_pci.c |  2 +-
 hw/xics.c      | 16 +++++++---------
 hw/xics.h      |  8 +-------
 5 files changed, 17 insertions(+), 25 deletions(-)

(limited to 'hw')

diff --git a/hw/spapr.c b/hw/spapr.c
index 0a0e9cd..1177efa 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -89,7 +89,7 @@
 
 sPAPREnvironment *spapr;
 
-int spapr_allocate_irq(int hint, enum xics_irq_type type)
+int spapr_allocate_irq(int hint, bool lsi)
 {
     int irq;
 
@@ -105,13 +105,13 @@ int spapr_allocate_irq(int hint, enum xics_irq_type type)
         return 0;
     }
 
-    xics_set_irq_type(spapr->icp, irq, type);
+    xics_set_irq_type(spapr->icp, irq, lsi);
 
     return irq;
 }
 
 /* Allocate block of consequtive IRQs, returns a number of the first */
-int spapr_allocate_irq_block(int num, enum xics_irq_type type)
+int spapr_allocate_irq_block(int num, bool lsi)
 {
     int first = -1;
     int i;
@@ -119,7 +119,7 @@ int spapr_allocate_irq_block(int num, enum xics_irq_type type)
     for (i = 0; i < num; ++i) {
         int irq;
 
-        irq = spapr_allocate_irq(0, type);
+        irq = spapr_allocate_irq(0, lsi);
         if (!irq) {
             return -1;
         }
diff --git a/hw/spapr.h b/hw/spapr.h
index f9a7b0f..51a966b 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -291,17 +291,17 @@ void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
 target_ulong spapr_hypercall(CPUPPCState *env, target_ulong opcode,
                              target_ulong *args);
 
-int spapr_allocate_irq(int hint, enum xics_irq_type type);
-int spapr_allocate_irq_block(int num, enum xics_irq_type type);
+int spapr_allocate_irq(int hint, bool lsi);
+int spapr_allocate_irq_block(int num, bool lsi);
 
 static inline int spapr_allocate_msi(int hint)
 {
-    return spapr_allocate_irq(hint, XICS_MSI);
+    return spapr_allocate_irq(hint, false);
 }
 
 static inline int spapr_allocate_lsi(int hint)
 {
-    return spapr_allocate_irq(hint, XICS_LSI);
+    return spapr_allocate_irq(hint, true);
 }
 
 static inline uint32_t rtas_ld(target_ulong phys, int n)
diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
index 203155e..b628f89 100644
--- a/hw/spapr_pci.c
+++ b/hw/spapr_pci.c
@@ -351,7 +351,7 @@ static void rtas_ibm_change_msi(sPAPREnvironment *spapr,
 
     /* There is no cached config, allocate MSIs */
     if (!phb->msi_table[ndev].nvec) {
-        irq = spapr_allocate_irq_block(req_num, XICS_MSI);
+        irq = spapr_allocate_irq_block(req_num, true);
         if (irq < 0) {
             fprintf(stderr, "Cannot allocate MSIs for device#%d", ndev);
             rtas_st(rets, 0, -1); /* Hardware error */
diff --git a/hw/xics.c b/hw/xics.c
index 648af25..75c8cca 100644
--- a/hw/xics.c
+++ b/hw/xics.c
@@ -170,7 +170,7 @@ struct ics_irq_state {
 #define XICS_STATUS_REJECTED           0x4
 #define XICS_STATUS_MASKED_PENDING     0x8
     uint8_t status;
-    enum xics_irq_type type;
+    bool lsi;
 };
 
 struct ics_state {
@@ -244,7 +244,7 @@ static void ics_set_irq(void *opaque, int srcno, int val)
     struct ics_state *ics = (struct ics_state *)opaque;
     struct ics_irq_state *irq = ics->irqs + srcno;
 
-    if (irq->type == XICS_LSI) {
+    if (irq->lsi) {
         set_irq_lsi(ics, srcno, val);
     } else {
         set_irq_msi(ics, srcno, val);
@@ -278,7 +278,7 @@ static void ics_write_xive(struct ics_state *ics, int nr, int server,
     irq->server = server;
     irq->priority = priority;
 
-    if (irq->type == XICS_LSI) {
+    if (irq->lsi) {
         write_xive_lsi(ics, srcno);
     } else {
         write_xive_msi(ics, srcno);
@@ -301,7 +301,7 @@ static void ics_resend(struct ics_state *ics)
         struct ics_irq_state *irq = ics->irqs + i;
 
         /* FIXME: filter by server#? */
-        if (irq->type == XICS_LSI) {
+        if (irq->lsi) {
             resend_lsi(ics, i);
         } else {
             resend_msi(ics, i);
@@ -314,7 +314,7 @@ static void ics_eoi(struct ics_state *ics, int nr)
     int srcno = nr - ics->offset;
     struct ics_irq_state *irq = ics->irqs + srcno;
 
-    if (irq->type == XICS_LSI) {
+    if (irq->lsi) {
         irq->status &= ~XICS_STATUS_SENT;
     }
 }
@@ -333,14 +333,12 @@ qemu_irq xics_get_qirq(struct icp_state *icp, int irq)
     return icp->ics->qirqs[irq - icp->ics->offset];
 }
 
-void xics_set_irq_type(struct icp_state *icp, int irq,
-                       enum xics_irq_type type)
+void xics_set_irq_type(struct icp_state *icp, int irq, bool lsi)
 {
     assert((irq >= icp->ics->offset)
            && (irq < (icp->ics->offset + icp->ics->nr_irqs)));
-    assert((type == XICS_MSI) || (type == XICS_LSI));
 
-    icp->ics->irqs[irq - icp->ics->offset].type = type;
+    icp->ics->irqs[irq - icp->ics->offset].lsi = lsi;
 }
 
 static target_ulong h_cppr(CPUPPCState *env, sPAPREnvironment *spapr,
diff --git a/hw/xics.h b/hw/xics.h
index 99b96ac..6817268 100644
--- a/hw/xics.h
+++ b/hw/xics.h
@@ -31,14 +31,8 @@
 
 struct icp_state;
 
-enum xics_irq_type {
-    XICS_MSI,        /* Message-signalled (edge) interrupt */
-    XICS_LSI,        /* Level-signalled interrupt */
-};
-
 qemu_irq xics_get_qirq(struct icp_state *icp, int irq);
-void xics_set_irq_type(struct icp_state *icp, int irq,
-                       enum xics_irq_type type);
+void xics_set_irq_type(struct icp_state *icp, int irq, bool lsi);
 
 struct icp_state *xics_system_init(int nr_irqs);
 
-- 
cgit v1.1


From 490d4a2b6edafb27cd688ded7fdb1290453d71b0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:19 +0000
Subject: pseries: Remove never used flags field from spapr vio devices

The general device state structure for PAPR VIO emulated devices includes a
'flags' field which was never used.  This patch removes it.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr_vio.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'hw')

diff --git a/hw/spapr_vio.h b/hw/spapr_vio.h
index ea6aa43..acef65e 100644
--- a/hw/spapr_vio.h
+++ b/hw/spapr_vio.h
@@ -60,7 +60,6 @@ typedef struct VIOsPAPRDeviceClass {
 struct VIOsPAPRDevice {
     DeviceState qdev;
     uint32_t reg;
-    uint32_t flags;
     uint32_t irq;
     target_ulong signal_state;
     VIOsPAPR_CRQ crq;
-- 
cgit v1.1


From 53724ee565565f69560dbe17553bede8c0169379 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:20 +0000
Subject: pseries: Rework implementation of TCE bypass

On the pseries machine the IOMMU (aka TCE tables) is always active for all
PCI and VIO devices.  Mostly to simplify the SLOF firmware, we implement an
extension which allows the IOMMU to be temporarily disabled for certain
devices.

Currently this is implemented by setting the device's DMAContext pointer to
NULL (thus reverting to qemu's default no-IOMMU DMA behaviour), then
replacing it when bypass mode is disabled.

This approach causes a bunch of complications though.  It complexifies the
management of the DMAContext lifetimes, it's problematic for savevm/loadvm,
and it means that while bypass is active we have nowhere to store the
device's LIOBN (Logical IO Bus Number, used to identify DMA address
spaces).  At present we regenerate the LIOBN from other address information
but this restricts how we can allocate LIOBNs.

This patch gives up on this approach, replacing it with the much simpler
one of having a 'bypass' boolean flag in the TCE state structure.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr.h       |  1 +
 hw/spapr_iommu.c | 25 +++++++++++++++++++------
 hw/spapr_vio.c   | 26 ++++++++++----------------
 hw/spapr_vio.h   |  1 -
 4 files changed, 30 insertions(+), 23 deletions(-)

(limited to 'hw')

diff --git a/hw/spapr.h b/hw/spapr.h
index 51a966b..e984e3f 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -339,6 +339,7 @@ void spapr_iommu_init(void);
 DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size);
 void spapr_tce_free(DMAContext *dma);
 void spapr_tce_reset(DMAContext *dma);
+void spapr_tce_set_bypass(DMAContext *dma, bool bypass);
 int spapr_dma_dt(void *fdt, int node_off, const char *propname,
                  uint32_t liobn, uint64_t window, uint32_t size);
 int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
index 216aa06..38034c0 100644
--- a/hw/spapr_iommu.c
+++ b/hw/spapr_iommu.c
@@ -42,6 +42,7 @@ struct sPAPRTCETable {
     uint32_t liobn;
     uint32_t window_size;
     sPAPRTCE *table;
+    bool bypass;
     int fd;
     QLIST_ENTRY(sPAPRTCETable) list;
 };
@@ -78,6 +79,12 @@ static int spapr_tce_translate(DMAContext *dma,
             DMA_ADDR_FMT "\n", tcet->liobn, addr);
 #endif
 
+    if (tcet->bypass) {
+        *paddr = addr;
+        *len = (target_phys_addr_t)-1;
+        return 0;
+    }
+
     /* Check if we are in bound */
     if (addr >= tcet->window_size) {
 #ifdef DEBUG_TCE
@@ -162,15 +169,21 @@ void spapr_tce_free(DMAContext *dma)
     }
 }
 
+void spapr_tce_set_bypass(DMAContext *dma, bool bypass)
+{
+    sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, dma);
+
+    tcet->bypass = bypass;
+}
+
 void spapr_tce_reset(DMAContext *dma)
 {
-    if (dma) {
-        sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, dma);
-        size_t table_size = (tcet->window_size >> SPAPR_TCE_PAGE_SHIFT)
-            * sizeof(sPAPRTCE);
+    sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, dma);
+    size_t table_size = (tcet->window_size >> SPAPR_TCE_PAGE_SHIFT)
+        * sizeof(sPAPRTCE);
 
-        memset(tcet->table, 0, table_size);
-    }
+    tcet->bypass = false;
+    memset(tcet->table, 0, table_size);
 }
 
 static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba,
diff --git a/hw/spapr_vio.c b/hw/spapr_vio.c
index 752836e..848806d 100644
--- a/hw/spapr_vio.c
+++ b/hw/spapr_vio.c
@@ -316,14 +316,9 @@ int spapr_vio_send_crq(VIOsPAPRDevice *dev, uint8_t *crq)
 
 static void spapr_vio_quiesce_one(VIOsPAPRDevice *dev)
 {
-    VIOsPAPRDeviceClass *pc = VIO_SPAPR_DEVICE_GET_CLASS(dev);
-    uint32_t liobn = SPAPR_VIO_BASE_LIOBN | dev->reg;
-
     if (dev->dma) {
-        spapr_tce_free(dev->dma);
+        spapr_tce_reset(dev->dma);
     }
-    dev->dma = spapr_tce_new_dma_context(liobn, pc->rtce_window_size);
-
     free_crq(dev);
 }
 
@@ -346,16 +341,14 @@ static void rtas_set_tce_bypass(sPAPREnvironment *spapr, uint32_t token,
         rtas_st(rets, 0, -3);
         return;
     }
-    if (enable) {
-        spapr_tce_free(dev->dma);
-        dev->dma = NULL;
-    } else {
-        VIOsPAPRDeviceClass *pc = VIO_SPAPR_DEVICE_GET_CLASS(dev);
-        uint32_t liobn = SPAPR_VIO_BASE_LIOBN | dev->reg;
 
-        dev->dma = spapr_tce_new_dma_context(liobn, pc->rtce_window_size);
+    if (!dev->dma) {
+        rtas_st(rets, 0, -3);
+        return;
     }
 
+    spapr_tce_set_bypass(dev->dma, !!enable);
+
     rtas_st(rets, 0, 0);
 }
 
@@ -421,7 +414,6 @@ static int spapr_vio_busdev_init(DeviceState *qdev)
 {
     VIOsPAPRDevice *dev = (VIOsPAPRDevice *)qdev;
     VIOsPAPRDeviceClass *pc = VIO_SPAPR_DEVICE_GET_CLASS(dev);
-    uint32_t liobn;
     char *id;
 
     if (dev->reg != -1) {
@@ -463,8 +455,10 @@ static int spapr_vio_busdev_init(DeviceState *qdev)
         return -1;
     }
 
-    liobn = SPAPR_VIO_BASE_LIOBN | dev->reg;
-    dev->dma = spapr_tce_new_dma_context(liobn, pc->rtce_window_size);
+    if (pc->rtce_window_size) {
+        uint32_t liobn = SPAPR_VIO_BASE_LIOBN | dev->reg;
+        dev->dma = spapr_tce_new_dma_context(liobn, pc->rtce_window_size);
+    }
 
     return pc->init(dev);
 }
diff --git a/hw/spapr_vio.h b/hw/spapr_vio.h
index acef65e..cc85d26 100644
--- a/hw/spapr_vio.h
+++ b/hw/spapr_vio.h
@@ -131,7 +131,6 @@ void spapr_vscsi_create(VIOsPAPRBus *bus);
 
 VIOsPAPRDevice *spapr_vty_get_default(VIOsPAPRBus *bus);
 
-int spapr_tce_set_bypass(uint32_t unit, uint32_t enable);
 void spapr_vio_quiesce(void);
 
 #endif /* _HW_SPAPR_VIO_H */
-- 
cgit v1.1


From 3fe719f467530b7c8ac0797881ff4b66d1357c18 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 12 Sep 2012 16:57:21 +0000
Subject: pseries: Fix semantics of RTAS int-on, int-off and set-xive functions

Currently the ibm,int-on and ibm,int-off RTAS functions are implemented as
no-ops.  This is because when implemented as specified in PAPR they caused
Linux (which calls both int-on/off and set-xive) to end up with interrupts
masked when they should not be.  Since Linux's set-xive calls make the
int-on/off calls redundant, making them nops worked around the problem.

In fact, the problem was caused because there was a subtle bug in set-xive,
PAPR specifies that as well as updating the current priority, it also needs
to update the saved priority used by int-on/off.  With this bug fixed the
problem goes away.  This patch implements this more correct fix.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/xics.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

(limited to 'hw')

diff --git a/hw/xics.c b/hw/xics.c
index 75c8cca..ce88aa7 100644
--- a/hw/xics.c
+++ b/hw/xics.c
@@ -270,13 +270,14 @@ static void write_xive_lsi(struct ics_state *ics, int srcno)
 }
 
 static void ics_write_xive(struct ics_state *ics, int nr, int server,
-                           uint8_t priority)
+                           uint8_t priority, uint8_t saved_priority)
 {
     int srcno = nr - ics->offset;
     struct ics_irq_state *irq = ics->irqs + srcno;
 
     irq->server = server;
     irq->priority = priority;
+    irq->saved_priority = saved_priority;
 
     if (irq->lsi) {
         write_xive_lsi(ics, srcno);
@@ -405,7 +406,7 @@ static void rtas_set_xive(sPAPREnvironment *spapr, uint32_t token,
         return;
     }
 
-    ics_write_xive(ics, nr, server, priority);
+    ics_write_xive(ics, nr, server, priority, priority);
 
     rtas_st(rets, 0, 0); /* Success */
 }
@@ -453,14 +454,8 @@ static void rtas_int_off(sPAPREnvironment *spapr, uint32_t token,
         return;
     }
 
-    /* This is a NOP for now, since the described PAPR semantics don't
-     * seem to gel with what Linux does */
-#if 0
-    struct ics_irq_state *irq = xics->irqs + (nr - xics->offset);
-
-    irq->saved_priority = irq->priority;
-    ics_write_xive_msi(xics, nr, irq->server, 0xff);
-#endif
+    ics_write_xive(ics, nr, ics->irqs[nr - ics->offset].server, 0xff,
+                   ics->irqs[nr - ics->offset].priority);
 
     rtas_st(rets, 0, 0); /* Success */
 }
@@ -484,13 +479,9 @@ static void rtas_int_on(sPAPREnvironment *spapr, uint32_t token,
         return;
     }
 
-    /* This is a NOP for now, since the described PAPR semantics don't
-     * seem to gel with what Linux does */
-#if 0
-    struct ics_irq_state *irq = xics->irqs + (nr - xics->offset);
-
-    ics_write_xive_msi(xics, nr, irq->server, irq->saved_priority);
-#endif
+    ics_write_xive(ics, nr, ics->irqs[nr - ics->offset].server,
+                   ics->irqs[nr - ics->offset].saved_priority,
+                   ics->irqs[nr - ics->offset].saved_priority);
 
     rtas_st(rets, 0, 0); /* Success */
 }
-- 
cgit v1.1


From 5a1972c8472fafd519a68b689fdcaf33ec857945 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Fri, 31 Aug 2012 22:21:21 +0200
Subject: ppc405_uc: Fix buffer overflow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Report from smatch:

ppc405_uc.c:209 dcr_read_pob(12) error: buffer overflow 'pob->besr' 2 <= 2
ppc405_uc.c:232 dcr_write_pob(12) error: buffer overflow 'pob->besr' 2 <= 2

The old code reads and writes besr[POB0_BESR1 - POB0_BESR0] or besr[2]
which is one too much.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Reviewed-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/ppc405_uc.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'hw')

diff --git a/hw/ppc405_uc.c b/hw/ppc405_uc.c
index 89e5013..b52ab2f 100644
--- a/hw/ppc405_uc.c
+++ b/hw/ppc405_uc.c
@@ -191,7 +191,8 @@ enum {
 typedef struct ppc4xx_pob_t ppc4xx_pob_t;
 struct ppc4xx_pob_t {
     uint32_t bear;
-    uint32_t besr[2];
+    uint32_t besr0;
+    uint32_t besr1;
 };
 
 static uint32_t dcr_read_pob (void *opaque, int dcrn)
@@ -205,8 +206,10 @@ static uint32_t dcr_read_pob (void *opaque, int dcrn)
         ret = pob->bear;
         break;
     case POB0_BESR0:
+        ret = pob->besr0;
+        break;
     case POB0_BESR1:
-        ret = pob->besr[dcrn - POB0_BESR0];
+        ret = pob->besr1;
         break;
     default:
         /* Avoid gcc warning */
@@ -227,9 +230,12 @@ static void dcr_write_pob (void *opaque, int dcrn, uint32_t val)
         /* Read only */
         break;
     case POB0_BESR0:
+        /* Write-clear */
+        pob->besr0 &= ~val;
+        break;
     case POB0_BESR1:
         /* Write-clear */
-        pob->besr[dcrn - POB0_BESR0] &= ~val;
+        pob->besr1 &= ~val;
         break;
     }
 }
@@ -241,8 +247,8 @@ static void ppc4xx_pob_reset (void *opaque)
     pob = opaque;
     /* No error */
     pob->bear = 0x00000000;
-    pob->besr[0] = 0x0000000;
-    pob->besr[1] = 0x0000000;
+    pob->besr0 = 0x0000000;
+    pob->besr1 = 0x0000000;
 }
 
 static void ppc4xx_pob_init(CPUPPCState *env)
-- 
cgit v1.1


From 35f9304d925a5423c51bd2c83a81fa3cc2b6e680 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 20 Sep 2012 17:42:30 +0000
Subject: pseries: Remove unnecessary locking from PAPR hash table hcalls

In the paravirtualized environment provided by PAPR, there is a standard
locking scheme so that hypercalls updating the hash page table from
different guest threads don't corrupt the haah table state.  We implement
this HVLOCK bit in out page table hypercalls.  However, it is not necessary
in our case, since the hypercalls all run in the qemu environment under the
big qemu lock.

Therefore, this patch removes the locking code.  This has the additional
advantage of freeing up a hash PTE bit which will be useful for migration
support.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr_hcall.c | 42 ++++--------------------------------------
 1 file changed, 4 insertions(+), 38 deletions(-)

(limited to 'hw')

diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
index 2df94d1..826ca67 100644
--- a/hw/spapr_hcall.c
+++ b/hw/spapr_hcall.c
@@ -39,22 +39,6 @@
 #define HPTE_V_1TB_SEG          0x4000000000000000ULL
 #define HPTE_V_VRMA_MASK        0x4001ffffff000000ULL
 
-#define HPTE_V_HVLOCK           0x40ULL
-
-static inline int lock_hpte(void *hpte, target_ulong bits)
-{
-    uint64_t pteh;
-
-    pteh = ldq_p(hpte);
-
-    /* We're protected by qemu's global lock here */
-    if (pteh & bits) {
-        return 0;
-    }
-    stq_p(hpte, pteh | HPTE_V_HVLOCK);
-    return 1;
-}
-
 static target_ulong compute_tlbie_rb(target_ulong v, target_ulong r,
                                      target_ulong pte_index)
 {
@@ -151,8 +135,7 @@ static target_ulong h_enter(CPUPPCState *env, sPAPREnvironment *spapr,
             if (i == 8) {
                 return H_PTEG_FULL;
             }
-            if (((ldq_p(hpte) & HPTE_V_VALID) == 0) &&
-                lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) {
+            if ((ldq_p(hpte) & HPTE_V_VALID) == 0) {
                 break;
             }
             hpte += HASH_PTE_SIZE_64;
@@ -160,7 +143,7 @@ static target_ulong h_enter(CPUPPCState *env, sPAPREnvironment *spapr,
     } else {
         i = 0;
         hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
-        if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) {
+        if (ldq_p(hpte) & HPTE_V_VALID) {
             return H_PTEG_FULL;
         }
     }
@@ -168,7 +151,6 @@ static target_ulong h_enter(CPUPPCState *env, sPAPREnvironment *spapr,
     /* eieio();  FIXME: need some sort of barrier for smp? */
     stq_p(hpte, pteh);
 
-    assert(!(ldq_p(hpte) & HPTE_V_HVLOCK));
     args[0] = pte_index + i;
     return H_SUCCESS;
 }
@@ -193,11 +175,6 @@ static target_ulong remove_hpte(CPUPPCState *env, target_ulong ptex,
     }
 
     hpte = env->external_htab + (ptex * HASH_PTE_SIZE_64);
-    while (!lock_hpte(hpte, HPTE_V_HVLOCK)) {
-        /* We have no real concurrency in qemu soft-emulation, so we
-         * will never actually have a contested lock */
-        assert(0);
-    }
 
     v = ldq_p(hpte);
     r = ldq_p(hpte + (HASH_PTE_SIZE_64/2));
@@ -205,16 +182,13 @@ static target_ulong remove_hpte(CPUPPCState *env, target_ulong ptex,
     if ((v & HPTE_V_VALID) == 0 ||
         ((flags & H_AVPN) && (v & ~0x7fULL) != avpn) ||
         ((flags & H_ANDCOND) && (v & avpn) != 0)) {
-        stq_p(hpte, v & ~HPTE_V_HVLOCK);
-        assert(!(ldq_p(hpte) & HPTE_V_HVLOCK));
         return REMOVE_NOT_FOUND;
     }
-    *vp = v & ~HPTE_V_HVLOCK;
+    *vp = v;
     *rp = r;
     stq_p(hpte, 0);
     rb = compute_tlbie_rb(v, r, ptex);
     ppc_tlb_invalidate_one(env, rb);
-    assert(!(ldq_p(hpte) & HPTE_V_HVLOCK));
     return REMOVE_SUCCESS;
 }
 
@@ -324,19 +298,12 @@ static target_ulong h_protect(CPUPPCState *env, sPAPREnvironment *spapr,
     }
 
     hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
-    while (!lock_hpte(hpte, HPTE_V_HVLOCK)) {
-        /* We have no real concurrency in qemu soft-emulation, so we
-         * will never actually have a contested lock */
-        assert(0);
-    }
 
     v = ldq_p(hpte);
     r = ldq_p(hpte + (HASH_PTE_SIZE_64/2));
 
     if ((v & HPTE_V_VALID) == 0 ||
         ((flags & H_AVPN) && (v & ~0x7fULL) != avpn)) {
-        stq_p(hpte, v & ~HPTE_V_HVLOCK);
-        assert(!(ldq_p(hpte) & HPTE_V_HVLOCK));
         return H_NOT_FOUND;
     }
 
@@ -350,8 +317,7 @@ static target_ulong h_protect(CPUPPCState *env, sPAPREnvironment *spapr,
     ppc_tlb_invalidate_one(env, rb);
     stq_p(hpte + (HASH_PTE_SIZE_64/2), r);
     /* Don't need a memory barrier, due to qemu's global lock */
-    stq_p(hpte, v & ~HPTE_V_HVLOCK);
-    assert(!(ldq_p(hpte) & HPTE_V_HVLOCK));
+    stq_p(hpte, v);
     return H_SUCCESS;
 }
 
-- 
cgit v1.1


From 382be75df77142cf6bdc7f5852738029eeb9e23a Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Thu, 20 Sep 2012 17:42:27 +0000
Subject: pseries: Set hash table size based on RAM size

Currently the pseries machine code always attempts to set the size of the
guests's hash page table to 16MB.  However, because of the way the POWER
MMU works, a suitable hash page table size should really depend on memory
size.  16MB will be excessive for guests with <1GB and RAM, and may not be
enough for guests with >2GB of RAM (depending on guest page size and
other factors).

The usual given rule of thumb is that the hash table should be 1/64 of
the size of memory, but in fact the Linux guests we are aiming at don't
really need that much.  This patch, therefore, changes the hash table
allocation code to aim for 1/128 of the size of RAM (rounding up).  When
using KVM, this size may still be adjusted by the host kernel if it is
unable to allocate a suitable (contiguous) table.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'hw')

diff --git a/hw/spapr.c b/hw/spapr.c
index 1177efa..a8bd3c1 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -725,9 +725,16 @@ static void ppc_spapr_init(ram_addr_t ram_size,
     spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
     load_limit = spapr->fdt_addr - FW_OVERHEAD;
 
-    /* For now, always aim for a 16MB hash table */
-    /* FIXME: we should change this default based on RAM size */
-    spapr->htab_shift = 24;
+    /* We aim for a hash table of size 1/128 the size of RAM.  The
+     * normal rule of thumb is 1/64 the size of RAM, but that's much
+     * more than needed for the Linux guests we support. */
+    spapr->htab_shift = 18; /* Minimum architected size */
+    while (spapr->htab_shift <= 46) {
+        if ((1ULL << (spapr->htab_shift + 7)) >= ram_size) {
+            break;
+        }
+        spapr->htab_shift++;
+    }
 
     /* init CPUs */
     if (cpu_model == NULL) {
-- 
cgit v1.1


From 711934334eb3895a89c555c1f57eb3d84ddb2906 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Sun, 23 Sep 2012 08:37:59 +0200
Subject: fdt: move dumpdtb interpretation code to device_tree.c

The dumpdtb code can be useful in more places than just for e500. Move it
to a generic place.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/ppc/e500.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

(limited to 'hw')

diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index 6f0de6d..5bab340 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -139,12 +139,10 @@ static int ppce500_load_device_tree(CPUPPCState *env,
             0x0, 0x10000,
         };
     QemuOpts *machine_opts;
-    const char *dumpdtb = NULL;
     const char *dtb_file = NULL;
 
     machine_opts = qemu_opts_find(qemu_find_opts("machine"), 0);
     if (machine_opts) {
-        dumpdtb = qemu_opt_get(machine_opts, "dumpdtb");
         dtb_file = qemu_opt_get(machine_opts, "dtb");
         toplevel_compat = qemu_opt_get(machine_opts, "dt_compatible");
     }
@@ -334,18 +332,7 @@ static int ppce500_load_device_tree(CPUPPCState *env,
     }
 
 done:
-    if (dumpdtb) {
-        /* Dump the dtb to a file and quit */
-        FILE *f = fopen(dumpdtb, "wb");
-        size_t len;
-        len = fwrite(fdt, fdt_size, 1, f);
-        fclose(f);
-        if (len != fdt_size) {
-            exit(1);
-        }
-        exit(0);
-    }
-
+    qemu_devtree_dumpdtb(fdt, fdt_size);
     ret = rom_add_blob_fixed(BINARY_DEVICE_TREE_FILE, fdt, fdt_size, addr);
     if (ret < 0) {
         goto out;
-- 
cgit v1.1


From 9dd5eba1bc69bccbd83885d157d84e2514799a22 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Wed, 22 Aug 2012 14:55:40 +0000
Subject: PPC: e500: increase DTC_LOAD_PAD

An allowance of 5 MiB for BSS is not enough for Linux kernels with certain
debug options enabled (not sure exactly which one caused it, but I'd guess
lockdep).  The kernel I ran into this with had a BSS of around 6.4 MB.

Unfortunately, uImage does not give us enough information to determine the
actual BSS size.  Increase the allowance to 18 MiB to give us plenty of
room.  Eventually this should be more intelligent, possibly packing
initrd+dtb at the end of guest RAM.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/ppc/e500.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'hw')

diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index 5bab340..fc3fde0 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -36,7 +36,7 @@
 
 #define BINARY_DEVICE_TREE_FILE    "mpc8544ds.dtb"
 #define UIMAGE_LOAD_BASE           0
-#define DTC_LOAD_PAD               0x500000
+#define DTC_LOAD_PAD               0x1800000
 #define DTC_PAD_MASK               0xFFFFF
 #define INITRD_LOAD_PAD            0x2000000
 #define INITRD_PAD_MASK            0xFFFFFF
-- 
cgit v1.1


From 7e7ec2d290ca5b1bdd555da9852dc5ee60232fe5 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Wed, 22 Aug 2012 14:55:41 +0000
Subject: PPC: e500: calculate initrd_base like dt_base

While investigating dtb pad issues, I noticed that initrd_base wasn't taking
loadaddr into account the way dt_base was.  This seems wrong.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/ppc/e500.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'hw')

diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index fc3fde0..feb712e 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -540,7 +540,8 @@ void ppce500_init(PPCE500Params *params)
 
     /* Load initrd. */
     if (params->initrd_filename) {
-        initrd_base = (kernel_size + INITRD_LOAD_PAD) & ~INITRD_PAD_MASK;
+        initrd_base = (loadaddr + kernel_size + INITRD_LOAD_PAD) &
+            ~INITRD_PAD_MASK;
         initrd_size = load_image_targphys(params->initrd_filename, initrd_base,
                                           ram_size - initrd_base);
 
-- 
cgit v1.1


From efcb9383b974114e5f682e531346006f8f2466c0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 25 Sep 2012 17:12:20 +0000
Subject: pseries: Don't test for MSR_PR for hypercalls under KVM

PAPR hypercalls should only be invoked from the guest kernel, not guest
user programs, that is, with MSR[PR]=0.  Currently we check this in
spapr_hypercall, returning H_PRIVILEGE if MSR[PR]=1.

However, under KVM the state of MSR[PR] is already checked by the host
kernel before passing the hypercall to qemu, making this check redundant.
Worse, however, we don't generally synchronize KVM and qemu state on the
hypercall path, meaning that qemu could incorrectly reject a hypercall
because it has a stale MSR value.

This patch fixes the problem by moving the privilege test exclusively to
the TCG hypercall path.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
CC: qemu-stable@nongnu.org
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/spapr.c       | 7 ++++++-
 hw/spapr_hcall.c | 5 -----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'hw')

diff --git a/hw/spapr.c b/hw/spapr.c
index a8bd3c1..ab227a0 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -573,7 +573,12 @@ static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
 
 static void emulate_spapr_hypercall(CPUPPCState *env)
 {
-    env->gpr[3] = spapr_hypercall(env, env->gpr[3], &env->gpr[4]);
+    if (msr_pr) {
+        hcall_dprintf("Hypercall made with MSR[PR]=1\n");
+        env->gpr[3] = H_PRIVILEGE;
+    } else {
+        env->gpr[3] = spapr_hypercall(env, env->gpr[3], &env->gpr[4]);
+    }
 }
 
 static void spapr_reset_htab(sPAPREnvironment *spapr)
diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
index 826ca67..194d9c2 100644
--- a/hw/spapr_hcall.c
+++ b/hw/spapr_hcall.c
@@ -681,11 +681,6 @@ void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn)
 target_ulong spapr_hypercall(CPUPPCState *env, target_ulong opcode,
                              target_ulong *args)
 {
-    if (msr_pr) {
-        hcall_dprintf("Hypercall made with MSR[PR]=1\n");
-        return H_PRIVILEGE;
-    }
-
     if ((opcode <= MAX_HCALL_OPCODE)
         && ((opcode & 0x3) == 0)) {
         spapr_hcall_fn fn = papr_hypercall_table[opcode / 4];
-- 
cgit v1.1


From fb37c3029c5a695e367baaacc6baf17640cc63cc Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 4 Oct 2012 18:52:21 +0200
Subject: PPC: e500: Only expose even TLB sizes in initial TLB

When booting our e500 machine, we automatically generate a big TLB entry
in TLB1 that covers all of the code we need to run in there until the guest
can handle its TLB on its own.

However, e500v2 can only handle MAS1.0 sizes. However, we keep our TLB
information in MAS2.0 layout, which means we have twice as many TLB sizes
to choose from. That also means we can run into a situation where we try
to add a TLB size that could not fit into the MAS1.0 size bits.

Fix it by making sure we always have the lower bit set to 0. That way we
are always guaranteed to have MAS1.0 compatible TLB size information.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/ppc/e500.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'hw')

diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c
index feb712e..d23f9b2 100644
--- a/hw/ppc/e500.c
+++ b/hw/ppc/e500.c
@@ -362,6 +362,10 @@ static void mmubooke_create_initial_mapping(CPUPPCState *env)
        the device tree top */
     dt_end = bi->dt_base + bi->dt_size;
     ps = booke206_page_size_to_tlb(dt_end) + 1;
+    if (ps & 1) {
+        /* e500v2 can only do even TLB size bits */
+        ps++;
+    }
     size = (ps << MAS1_TSIZE_SHIFT);
     tlb->mas1 = MAS1_VALID | size;
     tlb->mas2 = 0;
-- 
cgit v1.1