#include "sysemu/hw_accel.h"
#include "sysemu/sysemu.h"
#include "qemu/log.h"
+#include "qemu/error-report.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "helper_regs.h"
return H_SUCCESS;
}
+struct sPAPRPendingHPT {
+ /* These fields are read-only after initialization */
+ int shift;
+ QemuThread thread;
+
+ /* These fields are protected by the BQL */
+ bool complete;
+
+ /* These fields are private to the preparation thread if
+ * !complete, otherwise protected by the BQL */
+ int ret;
+ void *hpt;
+};
+
+static void free_pending_hpt(sPAPRPendingHPT *pending)
+{
+ if (pending->hpt) {
+ qemu_vfree(pending->hpt);
+ }
+
+ g_free(pending);
+}
+
+static void *hpt_prepare_thread(void *opaque)
+{
+ sPAPRPendingHPT *pending = opaque;
+ size_t size = 1ULL << pending->shift;
+
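+    /* qemu_try_memalign() returns NULL on failure rather than aborting
+     * the way qemu_memalign() does, so an allocation failure can be
+     * reported to the guest as H_NO_MEM below */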
+    pending->hpt = qemu_try_memalign(size, size);
+ if (pending->hpt) {
+ memset(pending->hpt, 0, size);
+ pending->ret = H_SUCCESS;
+ } else {
+ pending->ret = H_NO_MEM;
+ }
+
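+    /* Take the BQL so the check of spapr->pending_hpt below is atomic
+     * with respect to cancel_hpt_prepare() */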
+ qemu_mutex_lock_iothread();
+
+ if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt == pending) {
+ /* Ready to go */
+ pending->complete = true;
+ } else {
+ /* We've been cancelled, clean ourselves up */
+ free_pending_hpt(pending);
+ }
+
+ qemu_mutex_unlock_iothread();
+ return NULL;
+}
+
+/* Must be called with BQL held */
+static void cancel_hpt_prepare(sPAPRMachineState *spapr)
+{
+ sPAPRPendingHPT *pending = spapr->pending_hpt;
+
+ /* Let the thread know it's cancelled */
+ spapr->pending_hpt = NULL;
+
+ if (!pending) {
+ /* Nothing to do */
+ return;
+ }
+
+ if (!pending->complete) {
+ /* thread will clean itself up */
+ return;
+ }
+
+ free_pending_hpt(pending);
+}
+
+/* Convert a return code from the KVM ioctl()s implementing resize HPT
+ * into a PAPR hypercall return code */
+static target_ulong resize_hpt_convert_rc(int ret)
+{
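+    /* Positive returns from KVM are completion-time estimates; judging
+     * by the thresholds below the unit is milliseconds, so map them to
+     * the H_LONG_BUSY hint of matching order of magnitude */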
+ if (ret >= 100000) {
+ return H_LONG_BUSY_ORDER_100_SEC;
+ } else if (ret >= 10000) {
+ return H_LONG_BUSY_ORDER_10_SEC;
+ } else if (ret >= 1000) {
+ return H_LONG_BUSY_ORDER_1_SEC;
+ } else if (ret >= 100) {
+ return H_LONG_BUSY_ORDER_100_MSEC;
+ } else if (ret >= 10) {
+ return H_LONG_BUSY_ORDER_10_MSEC;
+ } else if (ret > 0) {
+ return H_LONG_BUSY_ORDER_1_MSEC;
+ }
+
+ switch (ret) {
+ case 0:
+ return H_SUCCESS;
+ case -EPERM:
+ return H_AUTHORITY;
+ case -EINVAL:
+ return H_PARAMETER;
+ case -ENXIO:
+ return H_CLOSED;
+ case -ENOSPC:
+ return H_PTEG_FULL;
+ case -EBUSY:
+ return H_BUSY;
+ case -ENOMEM:
+ return H_NO_MEM;
+ default:
+ return H_HARDWARE;
+ }
+}
+
+static target_ulong h_resize_hpt_prepare(PowerPCCPU *cpu,
+ sPAPRMachineState *spapr,
+ target_ulong opcode,
+ target_ulong *args)
+{
+ target_ulong flags = args[0];
+ int shift = args[1];
+ sPAPRPendingHPT *pending = spapr->pending_hpt;
+ uint64_t current_ram_size = MACHINE(spapr)->ram_size;
+ int rc;
+
+ if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
+ return H_AUTHORITY;
+ }
+
+ if (!spapr->htab_shift) {
+ /* Radix guest, no HPT */
+ return H_NOT_AVAILABLE;
+ }
+
+ trace_spapr_h_resize_hpt_prepare(flags, shift);
+
+ if (flags != 0) {
+ return H_PARAMETER;
+ }
+
+ if (shift && ((shift < 18) || (shift > 46))) {
+ return H_PARAMETER;
+ }
+
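+    /* Add the capacity of any plugged DIMMs on top of the base RAM
+     * size when judging how large an HPT is reasonable */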
+    current_ram_size += pc_existing_dimms_capacity(&error_fatal);
+
+ /* We only allow the guest to allocate an HPT one order above what
+ * we'd normally give them (to stop a small guest claiming a huge
+     * chunk of resources in the HPT) */
+ if (shift > (spapr_hpt_shift_for_ramsize(current_ram_size) + 1)) {
+ return H_RESOURCE;
+ }
+
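+    /* Offer the resize to KVM first; -ENOSYS means KVM isn't handling
+     * it (e.g. under TCG), so fall back to the userspace
+     * implementation below */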
+ rc = kvmppc_resize_hpt_prepare(cpu, flags, shift);
+ if (rc != -ENOSYS) {
+ return resize_hpt_convert_rc(rc);
+ }
+
+ if (pending) {
+ /* something already in progress */
+ if (pending->shift == shift) {
+ /* and it's suitable */
+ if (pending->complete) {
+ return pending->ret;
+ } else {
+ return H_LONG_BUSY_ORDER_100_MSEC;
+ }
+ }
+
+ /* not suitable, cancel and replace */
+ cancel_hpt_prepare(spapr);
+ }
+
+ if (!shift) {
+ /* nothing to do */
+ return H_SUCCESS;
+ }
+
+ /* start new prepare */
+
+ pending = g_new0(sPAPRPendingHPT, 1);
+ pending->shift = shift;
+ pending->ret = H_HARDWARE;
+
+ qemu_thread_create(&pending->thread, "sPAPR HPT prepare",
+ hpt_prepare_thread, pending, QEMU_THREAD_DETACHED);
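+    /* The thread is detached: if this prepare is cancelled or
+     * superseded, the thread frees the pending structure itself (see
+     * hpt_prepare_thread()) rather than being joined */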
+
+ spapr->pending_hpt = pending;
+
+ /* In theory we could estimate the time more accurately based on
+ * the new size, but there's not much point */
+ return H_LONG_BUSY_ORDER_100_MSEC;
+}
+
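+/* Accessors for the tentative HPT under construction; it is not yet
+ * visible to the guest or KVM, so plain loads and stores on host
+ * memory are sufficient */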
+static uint64_t new_hpte_load0(void *htab, uint64_t pteg, int slot)
+{
+ uint8_t *addr = htab;
+
+ addr += pteg * HASH_PTEG_SIZE_64;
+ addr += slot * HASH_PTE_SIZE_64;
+ return ldq_p(addr);
+}
+
+static void new_hpte_store(void *htab, uint64_t pteg, int slot,
+ uint64_t pte0, uint64_t pte1)
+{
+ uint8_t *addr = htab;
+
+ addr += pteg * HASH_PTEG_SIZE_64;
+ addr += slot * HASH_PTE_SIZE_64;
+
+ stq_p(addr, pte0);
+ stq_p(addr + HASH_PTE_SIZE_64 / 2, pte1);
+}
+
+static int rehash_hpte(PowerPCCPU *cpu,
+ const ppc_hash_pte64_t *hptes,
+ void *old_hpt, uint64_t oldsize,
+ void *new_hpt, uint64_t newsize,
+ uint64_t pteg, int slot)
+{
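+    /* A PTEG is HPTES_PER_GROUP * HASH_PTE_SIZE_64 = 128 bytes, hence
+     * the >> 7 to turn table sizes into PTEG-index hash masks */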
+ uint64_t old_hash_mask = (oldsize >> 7) - 1;
+ uint64_t new_hash_mask = (newsize >> 7) - 1;
+ target_ulong pte0 = ppc_hash64_hpte0(cpu, hptes, slot);
+ target_ulong pte1;
+ uint64_t avpn;
+ unsigned base_pg_shift;
+ uint64_t hash, new_pteg, replace_pte0;
+
+ if (!(pte0 & HPTE64_V_VALID) || !(pte0 & HPTE64_V_BOLTED)) {
+ return H_SUCCESS;
+ }
+
+ pte1 = ppc_hash64_hpte1(cpu, hptes, slot);
+
+ base_pg_shift = ppc_hash64_hpte_page_shift_noslb(cpu, pte0, pte1);
+ assert(base_pg_shift); /* H_ENTER shouldn't allow a bad encoding */
+ avpn = HPTE64_V_AVPN_VAL(pte0) & ~(((1ULL << base_pg_shift) - 1) >> 23);
+
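+    /* Entries in the secondary group live at the inverted hash; invert
+     * pteg here so the hash-recovery arithmetic below works on the
+     * primary hash value, and invert back when picking the new group */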
+ if (pte0 & HPTE64_V_SECONDARY) {
+ pteg = ~pteg;
+ }
+
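+    /* Reconstruct the VSID and page offset from the AVPN (borrowing
+     * hash bits from the old PTEG index where the base page shift has
+     * swallowed low AVPN bits), then recompute the full hash:
+     *   256M segments: hash = vsid ^ (seg_off >> pg_shift)
+     *   1T segments:   hash = vsid ^ (vsid << 25) ^ (seg_off >> pg_shift)
+     * mirroring the two branches below */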
+ if ((pte0 & HPTE64_V_SSIZE) == HPTE64_V_SSIZE_256M) {
+ uint64_t offset, vsid;
+
+ /* We only have 28 - 23 bits of offset in avpn */
+ offset = (avpn & 0x1f) << 23;
+ vsid = avpn >> 5;
+ /* We can find more bits from the pteg value */
+ if (base_pg_shift < 23) {
+ offset |= ((vsid ^ pteg) & old_hash_mask) << base_pg_shift;
+ }
+
+ hash = vsid ^ (offset >> base_pg_shift);
+ } else if ((pte0 & HPTE64_V_SSIZE) == HPTE64_V_SSIZE_1T) {
+ uint64_t offset, vsid;
+
+ /* We only have 40 - 23 bits of seg_off in avpn */
+ offset = (avpn & 0x1ffff) << 23;
+ vsid = avpn >> 17;
+ if (base_pg_shift < 23) {
+ offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask)
+ << base_pg_shift;
+ }
+
+ hash = vsid ^ (vsid << 25) ^ (offset >> base_pg_shift);
+ } else {
+ error_report("rehash_pte: Bad segment size in HPTE");
+ return H_HARDWARE;
+ }
+
+ new_pteg = hash & new_hash_mask;
+ if (pte0 & HPTE64_V_SECONDARY) {
+ assert(~pteg == (hash & old_hash_mask));
+ new_pteg = ~new_pteg;
+ } else {
+ assert(pteg == (hash & old_hash_mask));
+ }
+ assert((oldsize != newsize) || (pteg == new_pteg));
+ replace_pte0 = new_hpte_load0(new_hpt, new_pteg, slot);
+ /*
+ * Strictly speaking, we don't need all these tests, since we only
+     * ever rehash bolted HPTEs. We might handle non-bolted HPTEs in
+     * future, though, so make the logic correct for those cases as
+     * well.
+ */
+ if (replace_pte0 & HPTE64_V_VALID) {
+ assert(newsize < oldsize);
+ if (replace_pte0 & HPTE64_V_BOLTED) {
+ if (pte0 & HPTE64_V_BOLTED) {
+ /* Bolted collision, nothing we can do */
+ return H_PTEG_FULL;
+ } else {
+ /* Discard this hpte */
+ return H_SUCCESS;
+ }
+ }
+ }
+
+ new_hpte_store(new_hpt, new_pteg, slot, pte0, pte1);
+ return H_SUCCESS;
+}
+
+static int rehash_hpt(PowerPCCPU *cpu,
+ void *old_hpt, uint64_t oldsize,
+ void *new_hpt, uint64_t newsize)
+{
+ uint64_t n_ptegs = oldsize >> 7;
+ uint64_t pteg;
+ int slot;
+ int rc;
+
+ for (pteg = 0; pteg < n_ptegs; pteg++) {
+ hwaddr ptex = pteg * HPTES_PER_GROUP;
+ const ppc_hash_pte64_t *hptes
+ = ppc_hash64_map_hptes(cpu, ptex, HPTES_PER_GROUP);
+
+ if (!hptes) {
+ return H_HARDWARE;
+ }
+
+ for (slot = 0; slot < HPTES_PER_GROUP; slot++) {
+ rc = rehash_hpte(cpu, hptes, old_hpt, oldsize, new_hpt, newsize,
+ pteg, slot);
+ if (rc != H_SUCCESS) {
+ ppc_hash64_unmap_hptes(cpu, hptes, ptex, HPTES_PER_GROUP);
+ return rc;
+ }
+ }
+ ppc_hash64_unmap_hptes(cpu, hptes, ptex, HPTES_PER_GROUP);
+ }
+
+ return H_SUCCESS;
+}
+
+static target_ulong h_resize_hpt_commit(PowerPCCPU *cpu,
+ sPAPRMachineState *spapr,
+ target_ulong opcode,
+ target_ulong *args)
+{
+ target_ulong flags = args[0];
+ target_ulong shift = args[1];
+ sPAPRPendingHPT *pending = spapr->pending_hpt;
+ int rc;
+ size_t newsize;
+
+ if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
+ return H_AUTHORITY;
+ }
+
+ trace_spapr_h_resize_hpt_commit(flags, shift);
+
+ rc = kvmppc_resize_hpt_commit(cpu, flags, shift);
+ if (rc != -ENOSYS) {
+ return resize_hpt_convert_rc(rc);
+ }
+
+ if (flags != 0) {
+ return H_PARAMETER;
+ }
+
+ if (!pending || (pending->shift != shift)) {
+ /* no matching prepare */
+ return H_CLOSED;
+ }
+
+ if (!pending->complete) {
+ /* prepare has not completed */
+ return H_BUSY;
+ }
+
+ /* Shouldn't have got past PREPARE without an HPT */
+ g_assert(spapr->htab_shift);
+
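+    /* Move the bolted HPTEs into the new table; rehash_hpte() drops
+     * non-bolted entries, which a resize-aware guest expects to lose
+     * and will re-enter on demand */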
+ newsize = 1ULL << pending->shift;
+ rc = rehash_hpt(cpu, spapr->htab, HTAB_SIZE(spapr),
+ pending->hpt, newsize);
+ if (rc == H_SUCCESS) {
+ qemu_vfree(spapr->htab);
+ spapr->htab = pending->hpt;
+ spapr->htab_shift = pending->shift;
+
+ if (kvm_enabled()) {
+ /* For KVM PR, update the HPT pointer */
+ target_ulong sdr1 = (target_ulong)(uintptr_t)spapr->htab
+ | (spapr->htab_shift - 18);
+ kvmppc_update_sdr1(sdr1);
+ }
+
+ pending->hpt = NULL; /* so it's not free()d */
+ }
+
+ /* Clean up */
+ spapr->pending_hpt = NULL;
+ free_pending_hpt(pending);
+
+ return rc;
+}
+
static target_ulong h_set_sprg0(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
/* Update the UPRT and GTSE bits in the LPCR for all cpus */
CPU_FOREACH(cs) {
- set_spr(cs, SPR_LPCR, LPCR_UPRT | LPCR_GTSE,
+ set_spr(cs, SPR_LPCR,
((flags & (FLAG_RADIX | FLAG_HASH_PROC_TBL)) ? LPCR_UPRT : 0) |
- ((flags & FLAG_GTSE) ? LPCR_GTSE : 0));
+ ((flags & FLAG_GTSE) ? LPCR_GTSE : 0),
+ LPCR_UPRT | LPCR_GTSE);
}
if (kvm_enabled()) {
}
}
-static uint32_t cas_check_pvr(PowerPCCPU *cpu, target_ulong *addr,
- Error **errp)
+static uint32_t cas_check_pvr(sPAPRMachineState *spapr, PowerPCCPU *cpu,
+ target_ulong *addr, Error **errp)
{
bool explicit_match = false; /* Matched the CPU's real PVR */
- uint32_t max_compat = cpu->max_compat;
+ uint32_t max_compat = spapr->max_compat_pvr;
uint32_t best_compat = 0;
int i;
bool guest_radix;
Error *local_err = NULL;
- cas_pvr = cas_check_pvr(cpu, &addr, &local_err);
+ cas_pvr = cas_check_pvr(spapr, cpu, &addr, &local_err);
if (local_err) {
error_report_err(local_err);
return H_HARDWARE;
guest_radix = spapr_ovec_test(ov5_guest, OV5_MMU_RADIX_300);
spapr_ovec_clear(ov5_guest, OV5_MMU_RADIX_300);
+ /*
+ * HPT resizing is a bit of a special case, because when enabled
+ * we assume an HPT guest will support it until it says it
+ * doesn't, instead of assuming it won't support it until it says
+ * it does. Strictly speaking that approach could break for
+ * guests which don't make a CAS call, but those are so old we
+ * don't care about them. Without that assumption we'd have to
+ * make at least a temporary allocation of an HPT sized for max
+ * memory, which could be impossibly difficult under KVM HV if
+ * maxram is large.
+ */
+ if (!guest_radix && !spapr_ovec_test(ov5_guest, OV5_HPT_RESIZE)) {
+ int maxshift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size);
+
+ if (spapr->resize_hpt == SPAPR_RESIZE_HPT_REQUIRED) {
+ error_report(
+ "h_client_architecture_support: Guest doesn't support HPT resizing, but resize-hpt=required");
+ exit(1);
+ }
+
+        if (spapr->htab_shift < maxshift) {
+            /* Guest doesn't know about HPT resizing, so we
+             * pre-emptively resize for the maximum permitted RAM. At
+             * the point this is called, nothing should have been
+             * entered into the existing HPT */
+            spapr_reallocate_hpt(spapr, maxshift, &error_fatal);
+            if (kvm_enabled()) {
+                /* For KVM PR, update the HPT pointer (one call updates
+                 * all vCPUs, as in h_resize_hpt_commit()) */
+                target_ulong sdr1 = (target_ulong)(uintptr_t)spapr->htab
+                    | (spapr->htab_shift - 18);
+                kvmppc_update_sdr1(sdr1);
+            }
+        }
+ }
+
/* NOTE: there are actually a number of ov5 bits where input from the
* guest is always zero, and the platform/QEMU enables them independently
* of guest input. To model these properly we'd want some sort of mask,
/* hcall-bulk */
spapr_register_hypercall(H_BULK_REMOVE, h_bulk_remove);
+ /* hcall-hpt-resize */
+ spapr_register_hypercall(H_RESIZE_HPT_PREPARE, h_resize_hpt_prepare);
+ spapr_register_hypercall(H_RESIZE_HPT_COMMIT, h_resize_hpt_commit);
+
/* hcall-splpar */
spapr_register_hypercall(H_REGISTER_VPA, h_register_vpa);
spapr_register_hypercall(H_CEDE, h_cede);