2 * PowerPC implementation of KVM hooks
4 * Copyright IBM Corp. 2007
5 * Copyright (C) 2011 Freescale Semiconductor, Inc.
12 * This work is licensed under the terms of the GNU GPL, version 2 or later.
13 * See the COPYING file in the top-level directory.
17 #include "qemu/osdep.h"
19 #include <sys/ioctl.h>
22 #include <linux/kvm.h>
24 #include "qemu-common.h"
25 #include "qapi/error.h"
26 #include "qemu/error-report.h"
28 #include "cpu-models.h"
29 #include "qemu/timer.h"
30 #include "sysemu/sysemu.h"
31 #include "sysemu/hw_accel.h"
33 #include "sysemu/cpus.h"
34 #include "sysemu/device_tree.h"
35 #include "mmu-hash64.h"
37 #include "hw/sysbus.h"
38 #include "hw/ppc/spapr.h"
39 #include "hw/ppc/spapr_vio.h"
40 #include "hw/ppc/spapr_cpu_core.h"
41 #include "hw/ppc/ppc.h"
42 #include "sysemu/watchdog.h"
44 #include "exec/gdbstub.h"
45 #include "exec/memattrs.h"
46 #include "exec/ram_addr.h"
47 #include "sysemu/hostmem.h"
48 #include "qemu/cutils.h"
49 #include "qemu/mmap-alloc.h"
51 #include "sysemu/kvm_int.h"
55 #ifdef DEBUG_KVM
56 #define DPRINTF(fmt, ...) \
57 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
58 #else
59 #define DPRINTF(fmt, ...) \
60 do { } while (0)
61 #endif
63 #define PROC_DEVTREE_CPU "/proc/device-tree/cpus/"
65 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
69 static int cap_interrupt_unset = false;
70 static int cap_interrupt_level = false;
71 static int cap_segstate;
72 static int cap_booke_sregs;
73 static int cap_ppc_smt;
74 static int cap_ppc_smt_possible;
75 static int cap_ppc_rma;
76 static int cap_spapr_tce;
77 static int cap_spapr_tce_64;
78 static int cap_spapr_multitce;
79 static int cap_spapr_vfio;
81 static int cap_one_reg;
83 static int cap_ppc_watchdog;
85 static int cap_htab_fd;
86 static int cap_fixup_hcalls;
87 static int cap_htm; /* Hardware transactional memory support */
88 static int cap_mmu_radix;
89 static int cap_mmu_hash_v3;
90 static int cap_resize_hpt;
91 static int cap_ppc_pvr_compat;
93 static uint32_t debug_inst_opcode;
95 /* XXX We have a race condition where we actually have a level-triggered
96 * interrupt, but the infrastructure can't expose that yet, so the guest
97 * takes the interrupt but ignores it, goes to sleep and never gets notified
98 * that there's still an interrupt pending.
100 * As a quick workaround, let's just wake up again 20 ms after we injected
101 * an interrupt. That way we can ensure that we're always reinjecting
102 * interrupts in case the guest swallowed them.
104 static QEMUTimer *idle_timer;
106 static void kvm_kick_cpu(void *opaque)
108 PowerPCCPU *cpu = opaque;
110 qemu_cpu_kick(CPU(cpu));
113 /* Check whether we are running with KVM-PR (instead of KVM-HV). This
114 * should only be used for fallback tests - generally we should use
115 * explicit capabilities for the features we want, rather than
116 * assuming what is/isn't available depending on the KVM variant. */
117 static bool kvmppc_is_pr(KVMState *ks)
119 /* Assume KVM-PR if the GET_PVINFO capability is available */
120 return kvm_vm_check_extension(ks, KVM_CAP_PPC_GET_PVINFO) != 0;
123 static int kvm_ppc_register_host_cpu_type(MachineState *ms);
125 int kvm_arch_init(MachineState *ms, KVMState *s)
127 cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
128 cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL);
129 cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE);
130 cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS);
131 cap_ppc_smt_possible = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT_POSSIBLE);
132 cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
133 cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
134 cap_spapr_tce_64 = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_64);
135 cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
136 cap_spapr_vfio = false;
137 cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
138 cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
139 cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
140 cap_ppc_watchdog = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_WATCHDOG);
141 /* Note: we don't set cap_papr here, because this capability is
142 * only activated after this by kvmppc_set_papr() */
143 cap_htab_fd = kvm_vm_check_extension(s, KVM_CAP_PPC_HTAB_FD);
144 cap_fixup_hcalls = kvm_check_extension(s, KVM_CAP_PPC_FIXUP_HCALL);
145 cap_ppc_smt = kvm_vm_check_extension(s, KVM_CAP_PPC_SMT);
146 cap_htm = kvm_vm_check_extension(s, KVM_CAP_PPC_HTM);
147 cap_mmu_radix = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_RADIX);
148 cap_mmu_hash_v3 = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_HASH_V3);
149 cap_resize_hpt = kvm_vm_check_extension(s, KVM_CAP_SPAPR_RESIZE_HPT);
151 * Note: setting it to false because there is no such capability
152 * in KVM at this moment.
154 * TODO: call kvm_vm_check_extension() with the right capability
155 * after the kernel starts implementing it. */
156 cap_ppc_pvr_compat = false;
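/*
 * Illustrative sketch only: if the kernel ever gains a capability for
 * PVR compat mode, the hard-coded false above could become a real
 * check.  KVM_CAP_PPC_PVR_COMPAT is an assumed name, not an existing
 * KVM capability.
 *
 *     cap_ppc_pvr_compat = kvm_vm_check_extension(s, KVM_CAP_PPC_PVR_COMPAT);
 */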
158 if (!cap_interrupt_level) {
159 fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the "
160 "VM to stall at times!\n");
163 kvm_ppc_register_host_cpu_type(ms);
168 int kvm_arch_irqchip_create(MachineState *ms, KVMState *s)
173 static int kvm_arch_sync_sregs(PowerPCCPU *cpu)
175 CPUPPCState *cenv = &cpu->env;
176 CPUState *cs = CPU(cpu);
177 struct kvm_sregs sregs;
180 if (cenv->excp_model == POWERPC_EXCP_BOOKE) {
181 /* What we're really trying to say is "if we're on BookE, we use
182    the native PVR for now". This is the only sane way to check
183    it though, and it may mislead users into thinking they can run
184    BookE guests on BookS. Let's hope nobody dares enough :) */
188 fprintf(stderr, "kvm error: missing PVR setting capability\n");
193 ret = kvm_vcpu_ioctl(cs, KVM_GET_SREGS, &sregs);
198 sregs.pvr = cenv->spr[SPR_PVR];
199 return kvm_vcpu_ioctl(cs, KVM_SET_SREGS, &sregs);
202 /* Set up a shared TLB array with KVM */
203 static int kvm_booke206_tlb_init(PowerPCCPU *cpu)
205 CPUPPCState *env = &cpu->env;
206 CPUState *cs = CPU(cpu);
207 struct kvm_book3e_206_tlb_params params = {};
208 struct kvm_config_tlb cfg = {};
209 unsigned int entries = 0;
212 if (!kvm_enabled() ||
213 !kvm_check_extension(cs->kvm_state, KVM_CAP_SW_TLB)) {
217 assert(ARRAY_SIZE(params.tlb_sizes) == BOOKE206_MAX_TLBN);
219 for (i = 0; i < BOOKE206_MAX_TLBN; i++) {
220 params.tlb_sizes[i] = booke206_tlb_size(env, i);
221 params.tlb_ways[i] = booke206_tlb_ways(env, i);
222 entries += params.tlb_sizes[i];
225 assert(entries == env->nb_tlb);
226 assert(sizeof(struct kvm_book3e_206_tlb_entry) == sizeof(ppcmas_tlb_t));
228 env->tlb_dirty = true;
230 cfg.array = (uintptr_t)env->tlb.tlbm;
231 cfg.array_len = sizeof(ppcmas_tlb_t) * entries;
232 cfg.params = (uintptr_t)&params;
233 cfg.mmu_type = KVM_MMU_FSL_BOOKE_NOHV;
235 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_SW_TLB, 0, (uintptr_t)&cfg);
237 fprintf(stderr, "%s: couldn't enable KVM_CAP_SW_TLB: %s\n",
238 __func__, strerror(-ret));
242 env->kvm_sw_tlb = true;
247 #if defined(TARGET_PPC64)
248 static void kvm_get_fallback_smmu_info(PowerPCCPU *cpu,
249 struct kvm_ppc_smmu_info *info)
251 CPUPPCState *env = &cpu->env;
252 CPUState *cs = CPU(cpu);
254 memset(info, 0, sizeof(*info));
256 /* We don't have the new KVM_PPC_GET_SMMU_INFO ioctl, so we
257 * need to "guess" what the supported page sizes are.
259 * For that to work we make a few assumptions:
261 * - Check whether we are running "PR" KVM which only supports 4K
262 * and 16M pages, but supports them regardless of the backing
263 * store characteristics. We also don't support 1T segments.
265 * This is safe as if HV KVM ever supports that capability or PR
266 * KVM grows support for more page/segment sizes, those versions
267 * will have implemented KVM_CAP_PPC_GET_SMMU_INFO and thus we
268 * will not hit this fallback
270 * - Else we are running HV KVM. This means we only support page
271 * sizes that fit in the backing store. Additionally we only
272 * advertise 64K pages if the processor is ARCH 2.06 and we assume
273 * P7 encodings for the SLB and hash table. Here too, we assume
274 * support for any newer processor will mean a kernel that
275 * implements KVM_CAP_PPC_GET_SMMU_INFO and thus doesn't hit
278 if (kvmppc_is_pr(cs->kvm_state)) {
283 /* Standard 4k base page size segment */
284 info->sps[0].page_shift = 12;
285 info->sps[0].slb_enc = 0;
286 info->sps[0].enc[0].page_shift = 12;
287 info->sps[0].enc[0].pte_enc = 0;
289 /* Standard 16M large page size segment */
290 info->sps[1].page_shift = 24;
291 info->sps[1].slb_enc = SLB_VSID_L;
292 info->sps[1].enc[0].page_shift = 24;
293 info->sps[1].enc[0].pte_enc = 0;
297 /* HV KVM has backing store size restrictions */
298 info->flags = KVM_PPC_PAGE_SIZES_REAL;
300 if (env->mmu_model & POWERPC_MMU_1TSEG) {
301 info->flags |= KVM_PPC_1T_SEGMENTS;
304 if (POWERPC_MMU_VER(env->mmu_model) == POWERPC_MMU_VER_2_06 ||
305 POWERPC_MMU_VER(env->mmu_model) == POWERPC_MMU_VER_2_07) {
311 /* Standard 4k base page size segment */
312 info->sps[i].page_shift = 12;
313 info->sps[i].slb_enc = 0;
314 info->sps[i].enc[0].page_shift = 12;
315 info->sps[i].enc[0].pte_enc = 0;
318 /* 64K on MMU 2.06 and later */
319 if (POWERPC_MMU_VER(env->mmu_model) == POWERPC_MMU_VER_2_06 ||
320 POWERPC_MMU_VER(env->mmu_model) == POWERPC_MMU_VER_2_07) {
321 info->sps[i].page_shift = 16;
322 info->sps[i].slb_enc = 0x110;
323 info->sps[i].enc[0].page_shift = 16;
324 info->sps[i].enc[0].pte_enc = 1;
328 /* Standard 16M large page size segment */
329 info->sps[i].page_shift = 24;
330 info->sps[i].slb_enc = SLB_VSID_L;
331 info->sps[i].enc[0].page_shift = 24;
332 info->sps[i].enc[0].pte_enc = 0;
336 static void kvm_get_smmu_info(PowerPCCPU *cpu, struct kvm_ppc_smmu_info *info)
338 CPUState *cs = CPU(cpu);
341 if (kvm_check_extension(cs->kvm_state, KVM_CAP_PPC_GET_SMMU_INFO)) {
342 ret = kvm_vm_ioctl(cs->kvm_state, KVM_PPC_GET_SMMU_INFO, info);
348 kvm_get_fallback_smmu_info(cpu, info);
351 struct ppc_radix_page_info *kvm_get_radix_page_info(void)
353 KVMState *s = KVM_STATE(current_machine->accelerator);
354 struct ppc_radix_page_info *radix_page_info;
355 struct kvm_ppc_rmmu_info rmmu_info;
358 if (!kvm_check_extension(s, KVM_CAP_PPC_MMU_RADIX)) {
361 if (kvm_vm_ioctl(s, KVM_PPC_GET_RMMU_INFO, &rmmu_info)) {
364 radix_page_info = g_malloc0(sizeof(*radix_page_info));
365 radix_page_info->count = 0;
366 for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
367 if (rmmu_info.ap_encodings[i]) {
368 radix_page_info->entries[i] = rmmu_info.ap_encodings[i];
369 radix_page_info->count++;
372 return radix_page_info;
375 target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu,
376 bool radix, bool gtse,
379 CPUState *cs = CPU(cpu);
382 struct kvm_ppc_mmuv3_cfg cfg = {
383 .process_table = proc_tbl,
387 flags |= KVM_PPC_MMUV3_RADIX;
390 flags |= KVM_PPC_MMUV3_GTSE;
393 ret = kvm_vm_ioctl(cs->kvm_state, KVM_PPC_CONFIGURE_V3_MMU, &cfg);
400 return H_NOT_AVAILABLE;
406 static bool kvm_valid_page_size(uint32_t flags, long rampgsize, uint32_t shift)
408 if (!(flags & KVM_PPC_PAGE_SIZES_REAL)) {
412 return (1ul << shift) <= rampgsize;
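/*
 * Worked example (for illustration, not an exhaustive spec): with HV KVM
 * (KVM_PPC_PAGE_SIZES_REAL set) and 64 KiB backing pages
 * (rampgsize == 0x10000), a 64 KiB guest page (shift == 16) is accepted
 * because 1ul << 16 <= 0x10000, while a 16 MiB page (shift == 24) is
 * rejected.  Without KVM_PPC_PAGE_SIZES_REAL every size is accepted.
 */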
415 static long max_cpu_page_size;
417 static void kvm_fixup_page_sizes(PowerPCCPU *cpu)
419 static struct kvm_ppc_smmu_info smmu_info;
420 static bool has_smmu_info;
421 CPUPPCState *env = &cpu->env;
423 bool has_64k_pages = false;
425 /* We only handle page sizes for 64-bit server guests for now */
426 if (!(env->mmu_model & POWERPC_MMU_64)) {
430 /* Collect MMU info from kernel if not already */
431 if (!has_smmu_info) {
432 kvm_get_smmu_info(cpu, &smmu_info);
433 has_smmu_info = true;
436 if (!max_cpu_page_size) {
437 max_cpu_page_size = qemu_getrampagesize();
440 /* Convert to QEMU form */
441 memset(&env->sps, 0, sizeof(env->sps));
443 /* If we have HV KVM, we need to forbid CI large pages if our
444 * host page size is smaller than 64K.
446 if (smmu_info.flags & KVM_PPC_PAGE_SIZES_REAL) {
447 env->ci_large_pages = getpagesize() >= 0x10000;
451 * XXX This loop should be an entry wide AND of the capabilities that
452 * the selected CPU has with the capabilities that KVM supports.
454 for (ik = iq = 0; ik < KVM_PPC_PAGE_SIZES_MAX_SZ; ik++) {
455 struct ppc_one_seg_page_size *qsps = &env->sps.sps[iq];
456 struct kvm_ppc_one_seg_page_size *ksps = &smmu_info.sps[ik];
458 if (!kvm_valid_page_size(smmu_info.flags, max_cpu_page_size,
462 qsps->page_shift = ksps->page_shift;
463 qsps->slb_enc = ksps->slb_enc;
464 for (jk = jq = 0; jk < KVM_PPC_PAGE_SIZES_MAX_SZ; jk++) {
465 if (!kvm_valid_page_size(smmu_info.flags, max_cpu_page_size,
466 ksps->enc[jk].page_shift)) {
469 if (ksps->enc[jk].page_shift == 16) {
470 has_64k_pages = true;
472 qsps->enc[jq].page_shift = ksps->enc[jk].page_shift;
473 qsps->enc[jq].pte_enc = ksps->enc[jk].pte_enc;
474 if (++jq >= PPC_PAGE_SIZES_MAX_SZ) {
478 if (++iq >= PPC_PAGE_SIZES_MAX_SZ) {
482 env->slb_nr = smmu_info.slb_size;
483 if (!(smmu_info.flags & KVM_PPC_1T_SEGMENTS)) {
484 env->mmu_model &= ~POWERPC_MMU_1TSEG;
486 if (!has_64k_pages) {
487 env->mmu_model &= ~POWERPC_MMU_64K;
491 bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path)
493 Object *mem_obj = object_resolve_path(obj_path, NULL);
494 char *mempath = object_property_get_str(mem_obj, "mem-path", NULL);
498 pagesize = qemu_mempath_getpagesize(mempath);
501 pagesize = getpagesize();
504 return pagesize >= max_cpu_page_size;
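/*
 * Hypothetical usage sketch (the object path and error handling are
 * assumptions, not taken from this file): a machine setup path could
 * reject a memory backend whose pages are smaller than the largest
 * page size the CPU wants to use.
 *
 *     if (!kvmppc_is_mem_backend_page_size_ok("/objects/mem0")) {
 *         error_report("memory backend page size too small for guest pages");
 *         exit(1);
 *     }
 */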
507 #else /* defined (TARGET_PPC64) */
509 static inline void kvm_fixup_page_sizes(PowerPCCPU *cpu)
513 bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path)
518 #endif /* !defined (TARGET_PPC64) */
520 unsigned long kvm_arch_vcpu_id(CPUState *cpu)
522 return POWERPC_CPU(cpu)->vcpu_id;
525 /* e500 supports 2 h/w breakpoints and 2 watchpoints.
526 * book3s supports only 1 watchpoint, so an array size
527 * of 4 is sufficient for now.
529 #define MAX_HW_BKPTS 4
531 static struct HWBreakpoint {
534 } hw_debug_points[MAX_HW_BKPTS];
536 static CPUWatchpoint hw_watchpoint;
538 /* By default, no hardware breakpoints or watchpoints are supported */
539 static int max_hw_breakpoint;
540 static int max_hw_watchpoint;
541 static int nb_hw_breakpoint;
542 static int nb_hw_watchpoint;
544 static void kvmppc_hw_debug_points_init(CPUPPCState *cenv)
546 if (cenv->excp_model == POWERPC_EXCP_BOOKE) {
547 max_hw_breakpoint = 2;
548 max_hw_watchpoint = 2;
551 if ((max_hw_breakpoint + max_hw_watchpoint) > MAX_HW_BKPTS) {
552 fprintf(stderr, "Error initializing h/w breakpoints\n");
557 int kvm_arch_init_vcpu(CPUState *cs)
559 PowerPCCPU *cpu = POWERPC_CPU(cs);
560 CPUPPCState *cenv = &cpu->env;
563 /* Gather server mmu info from KVM and update the CPU state */
564 kvm_fixup_page_sizes(cpu);
566 /* Synchronize sregs with kvm */
567 ret = kvm_arch_sync_sregs(cpu);
569 if (ret == -EINVAL) {
570 error_report("Register sync failed... If you're using kvm-hv.ko,"
571 " only \"-cpu host\" is possible");
576 idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
578 switch (cenv->mmu_model) {
579 case POWERPC_MMU_BOOKE206:
580 /* This target supports access to KVM's guest TLB */
581 ret = kvm_booke206_tlb_init(cpu);
583 case POWERPC_MMU_2_07:
584 if (!cap_htm && !kvmppc_is_pr(cs->kvm_state)) {
585 /* KVM-HV has transactional memory on POWER8 also without the
586 * KVM_CAP_PPC_HTM extension, so enable it here instead as
587 * long as it's available to userspace on the host. */
588 if (qemu_getauxval(AT_HWCAP2) & PPC_FEATURE2_HAS_HTM) {
597 kvm_get_one_reg(cs, KVM_REG_PPC_DEBUG_INST, &debug_inst_opcode);
598 kvmppc_hw_debug_points_init(cenv);
603 static void kvm_sw_tlb_put(PowerPCCPU *cpu)
605 CPUPPCState *env = &cpu->env;
606 CPUState *cs = CPU(cpu);
607 struct kvm_dirty_tlb dirty_tlb;
608 unsigned char *bitmap;
611 if (!env->kvm_sw_tlb) {
615 bitmap = g_malloc((env->nb_tlb + 7) / 8);
616 memset(bitmap, 0xFF, (env->nb_tlb + 7) / 8);
618 dirty_tlb.bitmap = (uintptr_t)bitmap;
619 dirty_tlb.num_dirty = env->nb_tlb;
621 ret = kvm_vcpu_ioctl(cs, KVM_DIRTY_TLB, &dirty_tlb);
623 fprintf(stderr, "%s: KVM_DIRTY_TLB: %s\n",
624 __func__, strerror(-ret));
630 static void kvm_get_one_spr(CPUState *cs, uint64_t id, int spr)
632 PowerPCCPU *cpu = POWERPC_CPU(cs);
633 CPUPPCState *env = &cpu->env;
638 struct kvm_one_reg reg = {
640 .addr = (uintptr_t) &val,
644 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
646 trace_kvm_failed_spr_get(spr, strerror(errno));
648 switch (id & KVM_REG_SIZE_MASK) {
649 case KVM_REG_SIZE_U32:
650 env->spr[spr] = val.u32;
653 case KVM_REG_SIZE_U64:
654 env->spr[spr] = val.u64;
658 /* Don't handle this size yet */
664 static void kvm_put_one_spr(CPUState *cs, uint64_t id, int spr)
666 PowerPCCPU *cpu = POWERPC_CPU(cs);
667 CPUPPCState *env = &cpu->env;
672 struct kvm_one_reg reg = {
674 .addr = (uintptr_t) &val,
678 switch (id & KVM_REG_SIZE_MASK) {
679 case KVM_REG_SIZE_U32:
680 val.u32 = env->spr[spr];
683 case KVM_REG_SIZE_U64:
684 val.u64 = env->spr[spr];
688 /* Don't handle this size yet */
692 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
694 trace_kvm_failed_spr_set(spr, strerror(errno));
698 static int kvm_put_fp(CPUState *cs)
700 PowerPCCPU *cpu = POWERPC_CPU(cs);
701 CPUPPCState *env = &cpu->env;
702 struct kvm_one_reg reg;
706 if (env->insns_flags & PPC_FLOAT) {
707 uint64_t fpscr = env->fpscr;
708 bool vsx = !!(env->insns_flags2 & PPC2_VSX);
710 reg.id = KVM_REG_PPC_FPSCR;
711 reg.addr = (uintptr_t)&fpscr;
712 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
714 DPRINTF("Unable to set FPSCR to KVM: %s\n", strerror(errno));
718 for (i = 0; i < 32; i++) {
721 #ifdef HOST_WORDS_BIGENDIAN
722 vsr[0] = float64_val(env->fpr[i]);
723 vsr[1] = env->vsr[i];
725 vsr[0] = env->vsr[i];
726 vsr[1] = float64_val(env->fpr[i]);
728 reg.addr = (uintptr_t) &vsr;
729 reg.id = vsx ? KVM_REG_PPC_VSR(i) : KVM_REG_PPC_FPR(i);
731 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
733 DPRINTF("Unable to set %s%d to KVM: %s\n", vsx ? "VSR" : "FPR",
740 if (env->insns_flags & PPC_ALTIVEC) {
741 reg.id = KVM_REG_PPC_VSCR;
742 reg.addr = (uintptr_t)&env->vscr;
743 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
745 DPRINTF("Unable to set VSCR to KVM: %s\n", strerror(errno));
749 for (i = 0; i < 32; i++) {
750 reg.id = KVM_REG_PPC_VR(i);
751 reg.addr = (uintptr_t)&env->avr[i];
752 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
754 DPRINTF("Unable to set VR%d to KVM: %s\n", i, strerror(errno));
763 static int kvm_get_fp(CPUState *cs)
765 PowerPCCPU *cpu = POWERPC_CPU(cs);
766 CPUPPCState *env = &cpu->env;
767 struct kvm_one_reg reg;
771 if (env->insns_flags & PPC_FLOAT) {
773 bool vsx = !!(env->insns_flags2 & PPC2_VSX);
775 reg.id = KVM_REG_PPC_FPSCR;
776 reg.addr = (uintptr_t)&fpscr;
777 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
779 DPRINTF("Unable to get FPSCR from KVM: %s\n", strerror(errno));
785 for (i = 0; i < 32; i++) {
788 reg.addr = (uintptr_t) &vsr;
789 reg.id = vsx ? KVM_REG_PPC_VSR(i) : KVM_REG_PPC_FPR(i);
791 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
793 DPRINTF("Unable to get %s%d from KVM: %s\n",
794 vsx ? "VSR" : "FPR", i, strerror(errno));
797 #ifdef HOST_WORDS_BIGENDIAN
798 env->fpr[i] = vsr[0];
800 env->vsr[i] = vsr[1];
803 env->fpr[i] = vsr[1];
805 env->vsr[i] = vsr[0];
812 if (env->insns_flags & PPC_ALTIVEC) {
813 reg.id = KVM_REG_PPC_VSCR;
814 reg.addr = (uintptr_t)&env->vscr;
815 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
817 DPRINTF("Unable to get VSCR from KVM: %s\n", strerror(errno));
821 for (i = 0; i < 32; i++) {
822 reg.id = KVM_REG_PPC_VR(i);
823 reg.addr = (uintptr_t)&env->avr[i];
824 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
826 DPRINTF("Unable to get VR%d from KVM: %s\n",
836 #if defined(TARGET_PPC64)
837 static int kvm_get_vpa(CPUState *cs)
839 PowerPCCPU *cpu = POWERPC_CPU(cs);
840 CPUPPCState *env = &cpu->env;
841 struct kvm_one_reg reg;
844 reg.id = KVM_REG_PPC_VPA_ADDR;
845 reg.addr = (uintptr_t)&env->vpa_addr;
846 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
848 DPRINTF("Unable to get VPA address from KVM: %s\n", strerror(errno));
852 assert((uintptr_t)&env->slb_shadow_size
853 == ((uintptr_t)&env->slb_shadow_addr + 8));
854 reg.id = KVM_REG_PPC_VPA_SLB;
855 reg.addr = (uintptr_t)&env->slb_shadow_addr;
856 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
858 DPRINTF("Unable to get SLB shadow state from KVM: %s\n",
863 assert((uintptr_t)&env->dtl_size == ((uintptr_t)&env->dtl_addr + 8));
864 reg.id = KVM_REG_PPC_VPA_DTL;
865 reg.addr = (uintptr_t)&env->dtl_addr;
866 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
868 DPRINTF("Unable to get dispatch trace log state from KVM: %s\n",
876 static int kvm_put_vpa(CPUState *cs)
878 PowerPCCPU *cpu = POWERPC_CPU(cs);
879 CPUPPCState *env = &cpu->env;
880 struct kvm_one_reg reg;
883 /* SLB shadow or DTL can't be registered unless a master VPA is
884 * registered. That means when restoring state, if a VPA *is*
885 * registered, we need to set that up first. If not, we need to
886 * deregister the others before deregistering the master VPA */
887 assert(env->vpa_addr || !(env->slb_shadow_addr || env->dtl_addr));
890 reg.id = KVM_REG_PPC_VPA_ADDR;
891 reg.addr = (uintptr_t)&env->vpa_addr;
892 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
894 DPRINTF("Unable to set VPA address to KVM: %s\n", strerror(errno));
899 assert((uintptr_t)&env->slb_shadow_size
900 == ((uintptr_t)&env->slb_shadow_addr + 8));
901 reg.id = KVM_REG_PPC_VPA_SLB;
902 reg.addr = (uintptr_t)&env->slb_shadow_addr;
903 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
905 DPRINTF("Unable to set SLB shadow state to KVM: %s\n", strerror(errno));
909 assert((uintptr_t)&env->dtl_size == ((uintptr_t)&env->dtl_addr + 8));
910 reg.id = KVM_REG_PPC_VPA_DTL;
911 reg.addr = (uintptr_t)&env->dtl_addr;
912 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
914 DPRINTF("Unable to set dispatch trace log state to KVM: %s\n",
919 if (!env->vpa_addr) {
920 reg.id = KVM_REG_PPC_VPA_ADDR;
921 reg.addr = (uintptr_t)&env->vpa_addr;
922 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
924 DPRINTF("Unable to set VPA address to KVM: %s\n", strerror(errno));
931 #endif /* TARGET_PPC64 */
933 int kvmppc_put_books_sregs(PowerPCCPU *cpu)
935 CPUPPCState *env = &cpu->env;
936 struct kvm_sregs sregs;
939 sregs.pvr = env->spr[SPR_PVR];
942 PPCVirtualHypervisorClass *vhc =
943 PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
944 sregs.u.s.sdr1 = vhc->encode_hpt_for_kvm_pr(cpu->vhyp);
946 sregs.u.s.sdr1 = env->spr[SPR_SDR1];
951 for (i = 0; i < ARRAY_SIZE(env->slb); i++) {
952 sregs.u.s.ppc64.slb[i].slbe = env->slb[i].esid;
953 if (env->slb[i].esid & SLB_ESID_V) {
954 sregs.u.s.ppc64.slb[i].slbe |= i;
956 sregs.u.s.ppc64.slb[i].slbv = env->slb[i].vsid;
961 for (i = 0; i < 16; i++) {
962 sregs.u.s.ppc32.sr[i] = env->sr[i];
966 for (i = 0; i < 8; i++) {
967 /* Beware. We have to swap upper and lower bits here */
968 sregs.u.s.ppc32.dbat[i] = ((uint64_t)env->DBAT[0][i] << 32)
970 sregs.u.s.ppc32.ibat[i] = ((uint64_t)env->IBAT[0][i] << 32)
974 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
977 int kvm_arch_put_registers(CPUState *cs, int level)
979 PowerPCCPU *cpu = POWERPC_CPU(cs);
980 CPUPPCState *env = &cpu->env;
981 struct kvm_regs regs;
985 ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, &regs);
992 regs.xer = cpu_read_xer(env);
996 regs.srr0 = env->spr[SPR_SRR0];
997 regs.srr1 = env->spr[SPR_SRR1];
999 regs.sprg0 = env->spr[SPR_SPRG0];
1000 regs.sprg1 = env->spr[SPR_SPRG1];
1001 regs.sprg2 = env->spr[SPR_SPRG2];
1002 regs.sprg3 = env->spr[SPR_SPRG3];
1003 regs.sprg4 = env->spr[SPR_SPRG4];
1004 regs.sprg5 = env->spr[SPR_SPRG5];
1005 regs.sprg6 = env->spr[SPR_SPRG6];
1006 regs.sprg7 = env->spr[SPR_SPRG7];
1008 regs.pid = env->spr[SPR_BOOKE_PID];
1010 for (i = 0; i < 32; i++)
1011 regs.gpr[i] = env->gpr[i];
1014 for (i = 0; i < 8; i++) {
1015 regs.cr |= (env->crf[i] & 15) << (4 * (7 - i));
1018 ret = kvm_vcpu_ioctl(cs, KVM_SET_REGS, &regs);
1024 if (env->tlb_dirty) {
1025 kvm_sw_tlb_put(cpu);
1026 env->tlb_dirty = false;
1029 if (cap_segstate && (level >= KVM_PUT_RESET_STATE)) {
1030 ret = kvmppc_put_books_sregs(cpu);
1036 if (cap_hior && (level >= KVM_PUT_RESET_STATE)) {
1037 kvm_put_one_spr(cs, KVM_REG_PPC_HIOR, SPR_HIOR);
1043 /* We deliberately ignore errors here: for kernels which have
1044 * the ONE_REG calls but don't support the specific
1045 * registers, there's a reasonable chance things will still
1046 * work, at least until we try to migrate. */
1047 for (i = 0; i < 1024; i++) {
1048 uint64_t id = env->spr_cb[i].one_reg_id;
1051 kvm_put_one_spr(cs, id, i);
1057 for (i = 0; i < ARRAY_SIZE(env->tm_gpr); i++) {
1058 kvm_set_one_reg(cs, KVM_REG_PPC_TM_GPR(i), &env->tm_gpr[i]);
1060 for (i = 0; i < ARRAY_SIZE(env->tm_vsr); i++) {
1061 kvm_set_one_reg(cs, KVM_REG_PPC_TM_VSR(i), &env->tm_vsr[i]);
1063 kvm_set_one_reg(cs, KVM_REG_PPC_TM_CR, &env->tm_cr);
1064 kvm_set_one_reg(cs, KVM_REG_PPC_TM_LR, &env->tm_lr);
1065 kvm_set_one_reg(cs, KVM_REG_PPC_TM_CTR, &env->tm_ctr);
1066 kvm_set_one_reg(cs, KVM_REG_PPC_TM_FPSCR, &env->tm_fpscr);
1067 kvm_set_one_reg(cs, KVM_REG_PPC_TM_AMR, &env->tm_amr);
1068 kvm_set_one_reg(cs, KVM_REG_PPC_TM_PPR, &env->tm_ppr);
1069 kvm_set_one_reg(cs, KVM_REG_PPC_TM_VRSAVE, &env->tm_vrsave);
1070 kvm_set_one_reg(cs, KVM_REG_PPC_TM_VSCR, &env->tm_vscr);
1071 kvm_set_one_reg(cs, KVM_REG_PPC_TM_DSCR, &env->tm_dscr);
1072 kvm_set_one_reg(cs, KVM_REG_PPC_TM_TAR, &env->tm_tar);
1076 if (kvm_put_vpa(cs) < 0) {
1077 DPRINTF("Warning: Unable to set VPA information to KVM\n");
1081 kvm_set_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &env->tb_env->tb_offset);
1082 #endif /* TARGET_PPC64 */
1088 static void kvm_sync_excp(CPUPPCState *env, int vector, int ivor)
1090 env->excp_vectors[vector] = env->spr[ivor] + env->spr[SPR_BOOKE_IVPR];
1093 static int kvmppc_get_booke_sregs(PowerPCCPU *cpu)
1095 CPUPPCState *env = &cpu->env;
1096 struct kvm_sregs sregs;
1099 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
1104 if (sregs.u.e.features & KVM_SREGS_E_BASE) {
1105 env->spr[SPR_BOOKE_CSRR0] = sregs.u.e.csrr0;
1106 env->spr[SPR_BOOKE_CSRR1] = sregs.u.e.csrr1;
1107 env->spr[SPR_BOOKE_ESR] = sregs.u.e.esr;
1108 env->spr[SPR_BOOKE_DEAR] = sregs.u.e.dear;
1109 env->spr[SPR_BOOKE_MCSR] = sregs.u.e.mcsr;
1110 env->spr[SPR_BOOKE_TSR] = sregs.u.e.tsr;
1111 env->spr[SPR_BOOKE_TCR] = sregs.u.e.tcr;
1112 env->spr[SPR_DECR] = sregs.u.e.dec;
1113 env->spr[SPR_TBL] = sregs.u.e.tb & 0xffffffff;
1114 env->spr[SPR_TBU] = sregs.u.e.tb >> 32;
1115 env->spr[SPR_VRSAVE] = sregs.u.e.vrsave;
1118 if (sregs.u.e.features & KVM_SREGS_E_ARCH206) {
1119 env->spr[SPR_BOOKE_PIR] = sregs.u.e.pir;
1120 env->spr[SPR_BOOKE_MCSRR0] = sregs.u.e.mcsrr0;
1121 env->spr[SPR_BOOKE_MCSRR1] = sregs.u.e.mcsrr1;
1122 env->spr[SPR_BOOKE_DECAR] = sregs.u.e.decar;
1123 env->spr[SPR_BOOKE_IVPR] = sregs.u.e.ivpr;
1126 if (sregs.u.e.features & KVM_SREGS_E_64) {
1127 env->spr[SPR_BOOKE_EPCR] = sregs.u.e.epcr;
1130 if (sregs.u.e.features & KVM_SREGS_E_SPRG8) {
1131 env->spr[SPR_BOOKE_SPRG8] = sregs.u.e.sprg8;
1134 if (sregs.u.e.features & KVM_SREGS_E_IVOR) {
1135 env->spr[SPR_BOOKE_IVOR0] = sregs.u.e.ivor_low[0];
1136 kvm_sync_excp(env, POWERPC_EXCP_CRITICAL, SPR_BOOKE_IVOR0);
1137 env->spr[SPR_BOOKE_IVOR1] = sregs.u.e.ivor_low[1];
1138 kvm_sync_excp(env, POWERPC_EXCP_MCHECK, SPR_BOOKE_IVOR1);
1139 env->spr[SPR_BOOKE_IVOR2] = sregs.u.e.ivor_low[2];
1140 kvm_sync_excp(env, POWERPC_EXCP_DSI, SPR_BOOKE_IVOR2);
1141 env->spr[SPR_BOOKE_IVOR3] = sregs.u.e.ivor_low[3];
1142 kvm_sync_excp(env, POWERPC_EXCP_ISI, SPR_BOOKE_IVOR3);
1143 env->spr[SPR_BOOKE_IVOR4] = sregs.u.e.ivor_low[4];
1144 kvm_sync_excp(env, POWERPC_EXCP_EXTERNAL, SPR_BOOKE_IVOR4);
1145 env->spr[SPR_BOOKE_IVOR5] = sregs.u.e.ivor_low[5];
1146 kvm_sync_excp(env, POWERPC_EXCP_ALIGN, SPR_BOOKE_IVOR5);
1147 env->spr[SPR_BOOKE_IVOR6] = sregs.u.e.ivor_low[6];
1148 kvm_sync_excp(env, POWERPC_EXCP_PROGRAM, SPR_BOOKE_IVOR6);
1149 env->spr[SPR_BOOKE_IVOR7] = sregs.u.e.ivor_low[7];
1150 kvm_sync_excp(env, POWERPC_EXCP_FPU, SPR_BOOKE_IVOR7);
1151 env->spr[SPR_BOOKE_IVOR8] = sregs.u.e.ivor_low[8];
1152 kvm_sync_excp(env, POWERPC_EXCP_SYSCALL, SPR_BOOKE_IVOR8);
1153 env->spr[SPR_BOOKE_IVOR9] = sregs.u.e.ivor_low[9];
1154 kvm_sync_excp(env, POWERPC_EXCP_APU, SPR_BOOKE_IVOR9);
1155 env->spr[SPR_BOOKE_IVOR10] = sregs.u.e.ivor_low[10];
1156 kvm_sync_excp(env, POWERPC_EXCP_DECR, SPR_BOOKE_IVOR10);
1157 env->spr[SPR_BOOKE_IVOR11] = sregs.u.e.ivor_low[11];
1158 kvm_sync_excp(env, POWERPC_EXCP_FIT, SPR_BOOKE_IVOR11);
1159 env->spr[SPR_BOOKE_IVOR12] = sregs.u.e.ivor_low[12];
1160 kvm_sync_excp(env, POWERPC_EXCP_WDT, SPR_BOOKE_IVOR12);
1161 env->spr[SPR_BOOKE_IVOR13] = sregs.u.e.ivor_low[13];
1162 kvm_sync_excp(env, POWERPC_EXCP_DTLB, SPR_BOOKE_IVOR13);
1163 env->spr[SPR_BOOKE_IVOR14] = sregs.u.e.ivor_low[14];
1164 kvm_sync_excp(env, POWERPC_EXCP_ITLB, SPR_BOOKE_IVOR14);
1165 env->spr[SPR_BOOKE_IVOR15] = sregs.u.e.ivor_low[15];
1166 kvm_sync_excp(env, POWERPC_EXCP_DEBUG, SPR_BOOKE_IVOR15);
1168 if (sregs.u.e.features & KVM_SREGS_E_SPE) {
1169 env->spr[SPR_BOOKE_IVOR32] = sregs.u.e.ivor_high[0];
1170 kvm_sync_excp(env, POWERPC_EXCP_SPEU, SPR_BOOKE_IVOR32);
1171 env->spr[SPR_BOOKE_IVOR33] = sregs.u.e.ivor_high[1];
1172 kvm_sync_excp(env, POWERPC_EXCP_EFPDI, SPR_BOOKE_IVOR33);
1173 env->spr[SPR_BOOKE_IVOR34] = sregs.u.e.ivor_high[2];
1174 kvm_sync_excp(env, POWERPC_EXCP_EFPRI, SPR_BOOKE_IVOR34);
1177 if (sregs.u.e.features & KVM_SREGS_E_PM) {
1178 env->spr[SPR_BOOKE_IVOR35] = sregs.u.e.ivor_high[3];
1179 kvm_sync_excp(env, POWERPC_EXCP_EPERFM, SPR_BOOKE_IVOR35);
1182 if (sregs.u.e.features & KVM_SREGS_E_PC) {
1183 env->spr[SPR_BOOKE_IVOR36] = sregs.u.e.ivor_high[4];
1184 kvm_sync_excp(env, POWERPC_EXCP_DOORI, SPR_BOOKE_IVOR36);
1185 env->spr[SPR_BOOKE_IVOR37] = sregs.u.e.ivor_high[5];
1186 kvm_sync_excp(env, POWERPC_EXCP_DOORCI, SPR_BOOKE_IVOR37);
1190 if (sregs.u.e.features & KVM_SREGS_E_ARCH206_MMU) {
1191 env->spr[SPR_BOOKE_MAS0] = sregs.u.e.mas0;
1192 env->spr[SPR_BOOKE_MAS1] = sregs.u.e.mas1;
1193 env->spr[SPR_BOOKE_MAS2] = sregs.u.e.mas2;
1194 env->spr[SPR_BOOKE_MAS3] = sregs.u.e.mas7_3 & 0xffffffff;
1195 env->spr[SPR_BOOKE_MAS4] = sregs.u.e.mas4;
1196 env->spr[SPR_BOOKE_MAS6] = sregs.u.e.mas6;
1197 env->spr[SPR_BOOKE_MAS7] = sregs.u.e.mas7_3 >> 32;
1198 env->spr[SPR_MMUCFG] = sregs.u.e.mmucfg;
1199 env->spr[SPR_BOOKE_TLB0CFG] = sregs.u.e.tlbcfg[0];
1200 env->spr[SPR_BOOKE_TLB1CFG] = sregs.u.e.tlbcfg[1];
1203 if (sregs.u.e.features & KVM_SREGS_EXP) {
1204 env->spr[SPR_BOOKE_EPR] = sregs.u.e.epr;
1207 if (sregs.u.e.features & KVM_SREGS_E_PD) {
1208 env->spr[SPR_BOOKE_EPLC] = sregs.u.e.eplc;
1209 env->spr[SPR_BOOKE_EPSC] = sregs.u.e.epsc;
1212 if (sregs.u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
1213 env->spr[SPR_E500_SVR] = sregs.u.e.impl.fsl.svr;
1214 env->spr[SPR_Exxx_MCAR] = sregs.u.e.impl.fsl.mcar;
1215 env->spr[SPR_HID0] = sregs.u.e.impl.fsl.hid0;
1217 if (sregs.u.e.impl.fsl.features & KVM_SREGS_E_FSL_PIDn) {
1218 env->spr[SPR_BOOKE_PID1] = sregs.u.e.impl.fsl.pid1;
1219 env->spr[SPR_BOOKE_PID2] = sregs.u.e.impl.fsl.pid2;
1226 static int kvmppc_get_books_sregs(PowerPCCPU *cpu)
1228 CPUPPCState *env = &cpu->env;
1229 struct kvm_sregs sregs;
1233 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
1239 ppc_store_sdr1(env, sregs.u.s.sdr1);
1245 * The packed SLB array we get from KVM_GET_SREGS only contains
1246 * information about valid entries. So we flush our internal copy
1247 * to get rid of stale ones, then put all valid SLB entries back
1250 memset(env->slb, 0, sizeof(env->slb));
1251 for (i = 0; i < ARRAY_SIZE(env->slb); i++) {
1252 target_ulong rb = sregs.u.s.ppc64.slb[i].slbe;
1253 target_ulong rs = sregs.u.s.ppc64.slb[i].slbv;
1255 * Only restore valid entries
1257 if (rb & SLB_ESID_V) {
1258 ppc_store_slb(cpu, rb & 0xfff, rb & ~0xfffULL, rs);
1264 for (i = 0; i < 16; i++) {
1265 env->sr[i] = sregs.u.s.ppc32.sr[i];
1269 for (i = 0; i < 8; i++) {
1270 env->DBAT[0][i] = sregs.u.s.ppc32.dbat[i] & 0xffffffff;
1271 env->DBAT[1][i] = sregs.u.s.ppc32.dbat[i] >> 32;
1272 env->IBAT[0][i] = sregs.u.s.ppc32.ibat[i] & 0xffffffff;
1273 env->IBAT[1][i] = sregs.u.s.ppc32.ibat[i] >> 32;
1279 int kvm_arch_get_registers(CPUState *cs)
1281 PowerPCCPU *cpu = POWERPC_CPU(cs);
1282 CPUPPCState *env = &cpu->env;
1283 struct kvm_regs regs;
1287 ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, &regs);
1292 for (i = 7; i >= 0; i--) {
1293 env->crf[i] = cr & 15;
1297 env->ctr = regs.ctr;
1299 cpu_write_xer(env, regs.xer);
1300 env->msr = regs.msr;
1303 env->spr[SPR_SRR0] = regs.srr0;
1304 env->spr[SPR_SRR1] = regs.srr1;
1306 env->spr[SPR_SPRG0] = regs.sprg0;
1307 env->spr[SPR_SPRG1] = regs.sprg1;
1308 env->spr[SPR_SPRG2] = regs.sprg2;
1309 env->spr[SPR_SPRG3] = regs.sprg3;
1310 env->spr[SPR_SPRG4] = regs.sprg4;
1311 env->spr[SPR_SPRG5] = regs.sprg5;
1312 env->spr[SPR_SPRG6] = regs.sprg6;
1313 env->spr[SPR_SPRG7] = regs.sprg7;
1315 env->spr[SPR_BOOKE_PID] = regs.pid;
1317 for (i = 0; i < 32; i++)
1318 env->gpr[i] = regs.gpr[i];
1322 if (cap_booke_sregs) {
1323 ret = kvmppc_get_booke_sregs(cpu);
1330 ret = kvmppc_get_books_sregs(cpu);
1337 kvm_get_one_spr(cs, KVM_REG_PPC_HIOR, SPR_HIOR);
1343 /* We deliberately ignore errors here: for kernels which have
1344 * the ONE_REG calls but don't support the specific
1345 * registers, there's a reasonable chance things will still
1346 * work, at least until we try to migrate. */
1347 for (i = 0; i < 1024; i++) {
1348 uint64_t id = env->spr_cb[i].one_reg_id;
1351 kvm_get_one_spr(cs, id, i);
1357 for (i = 0; i < ARRAY_SIZE(env->tm_gpr); i++) {
1358 kvm_get_one_reg(cs, KVM_REG_PPC_TM_GPR(i), &env->tm_gpr[i]);
1360 for (i = 0; i < ARRAY_SIZE(env->tm_vsr); i++) {
1361 kvm_get_one_reg(cs, KVM_REG_PPC_TM_VSR(i), &env->tm_vsr[i]);
1363 kvm_get_one_reg(cs, KVM_REG_PPC_TM_CR, &env->tm_cr);
1364 kvm_get_one_reg(cs, KVM_REG_PPC_TM_LR, &env->tm_lr);
1365 kvm_get_one_reg(cs, KVM_REG_PPC_TM_CTR, &env->tm_ctr);
1366 kvm_get_one_reg(cs, KVM_REG_PPC_TM_FPSCR, &env->tm_fpscr);
1367 kvm_get_one_reg(cs, KVM_REG_PPC_TM_AMR, &env->tm_amr);
1368 kvm_get_one_reg(cs, KVM_REG_PPC_TM_PPR, &env->tm_ppr);
1369 kvm_get_one_reg(cs, KVM_REG_PPC_TM_VRSAVE, &env->tm_vrsave);
1370 kvm_get_one_reg(cs, KVM_REG_PPC_TM_VSCR, &env->tm_vscr);
1371 kvm_get_one_reg(cs, KVM_REG_PPC_TM_DSCR, &env->tm_dscr);
1372 kvm_get_one_reg(cs, KVM_REG_PPC_TM_TAR, &env->tm_tar);
1376 if (kvm_get_vpa(cs) < 0) {
1377 DPRINTF("Warning: Unable to get VPA information from KVM\n");
1381 kvm_get_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &env->tb_env->tb_offset);
1388 int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level)
1390 unsigned virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
1392 if (irq != PPC_INTERRUPT_EXT) {
1396 if (!kvm_enabled() || !cap_interrupt_unset || !cap_interrupt_level) {
1400 kvm_vcpu_ioctl(CPU(cpu), KVM_INTERRUPT, &virq);
1405 #if defined(TARGET_PPCEMB)
1406 #define PPC_INPUT_INT PPC40x_INPUT_INT
1407 #elif defined(TARGET_PPC64)
1408 #define PPC_INPUT_INT PPC970_INPUT_INT
1410 #define PPC_INPUT_INT PPC6xx_INPUT_INT
1413 void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
1415 PowerPCCPU *cpu = POWERPC_CPU(cs);
1416 CPUPPCState *env = &cpu->env;
1420 qemu_mutex_lock_iothread();
1422 /* PowerPC QEMU tracks the various core input pins (interrupt, critical
1423 * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
1424 if (!cap_interrupt_level &&
1425 run->ready_for_interrupt_injection &&
1426 (cs->interrupt_request & CPU_INTERRUPT_HARD) &&
1427 (env->irq_input_state & (1<<PPC_INPUT_INT)))
1429 /* For now KVM disregards the 'irq' argument. However, in the
1430 * future KVM could cache it in-kernel to avoid a heavyweight exit
1431 * when reading the UIC.
1433 irq = KVM_INTERRUPT_SET;
1435 DPRINTF("injected interrupt %d\n", irq);
1436 r = kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &irq);
1438 printf("cpu %d fail inject %x\n", cs->cpu_index, irq);
1441 /* Always wake up soon in case the interrupt was level based */
1442 timer_mod(idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
1443 (NANOSECONDS_PER_SECOND / 50));
1446 /* We don't know if there are more interrupts pending after this. However,
1447 * the guest will return to userspace in the course of handling this one
1448 * anyway, so we will get a chance to deliver the rest. */
1450 qemu_mutex_unlock_iothread();
1453 MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run)
1455 return MEMTXATTRS_UNSPECIFIED;
1458 int kvm_arch_process_async_events(CPUState *cs)
1463 static int kvmppc_handle_halt(PowerPCCPU *cpu)
1465 CPUState *cs = CPU(cpu);
1466 CPUPPCState *env = &cpu->env;
1468 if (!(cs->interrupt_request & CPU_INTERRUPT_HARD) && (msr_ee)) {
1470 cs->exception_index = EXCP_HLT;
1476 /* map dcr access to existing qemu dcr emulation */
1477 static int kvmppc_handle_dcr_read(CPUPPCState *env, uint32_t dcrn, uint32_t *data)
1479 if (ppc_dcr_read(env->dcr_env, dcrn, data) < 0)
1480 fprintf(stderr, "Read to unhandled DCR (0x%x)\n", dcrn);
1485 static int kvmppc_handle_dcr_write(CPUPPCState *env, uint32_t dcrn, uint32_t data)
1487 if (ppc_dcr_write(env->dcr_env, dcrn, data) < 0)
1488 fprintf(stderr, "Write to unhandled DCR (0x%x)\n", dcrn);
1493 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
1495 /* Mixed endian case is not handled */
1496 uint32_t sc = debug_inst_opcode;
1498 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn,
1500 cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&sc, sizeof(sc), 1)) {
1507 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
1511 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&sc, sizeof(sc), 0) ||
1512 sc != debug_inst_opcode ||
1513 cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn,
1521 static int find_hw_breakpoint(target_ulong addr, int type)
1525 assert((nb_hw_breakpoint + nb_hw_watchpoint)
1526 <= ARRAY_SIZE(hw_debug_points));
1528 for (n = 0; n < nb_hw_breakpoint + nb_hw_watchpoint; n++) {
1529 if (hw_debug_points[n].addr == addr &&
1530 hw_debug_points[n].type == type) {
1538 static int find_hw_watchpoint(target_ulong addr, int *flag)
1542 n = find_hw_breakpoint(addr, GDB_WATCHPOINT_ACCESS);
1544 *flag = BP_MEM_ACCESS;
1548 n = find_hw_breakpoint(addr, GDB_WATCHPOINT_WRITE);
1550 *flag = BP_MEM_WRITE;
1554 n = find_hw_breakpoint(addr, GDB_WATCHPOINT_READ);
1556 *flag = BP_MEM_READ;
1563 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
1564 target_ulong len, int type)
1566 if ((nb_hw_breakpoint + nb_hw_watchpoint) >= ARRAY_SIZE(hw_debug_points)) {
1570 hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint].addr = addr;
1571 hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint].type = type;
1574 case GDB_BREAKPOINT_HW:
1575 if (nb_hw_breakpoint >= max_hw_breakpoint) {
1579 if (find_hw_breakpoint(addr, type) >= 0) {
1586 case GDB_WATCHPOINT_WRITE:
1587 case GDB_WATCHPOINT_READ:
1588 case GDB_WATCHPOINT_ACCESS:
1589 if (nb_hw_watchpoint >= max_hw_watchpoint) {
1593 if (find_hw_breakpoint(addr, type) >= 0) {
1607 int kvm_arch_remove_hw_breakpoint(target_ulong addr,
1608 target_ulong len, int type)
1612 n = find_hw_breakpoint(addr, type);
1618 case GDB_BREAKPOINT_HW:
1622 case GDB_WATCHPOINT_WRITE:
1623 case GDB_WATCHPOINT_READ:
1624 case GDB_WATCHPOINT_ACCESS:
1631 hw_debug_points[n] = hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint];
1636 void kvm_arch_remove_all_hw_breakpoints(void)
1638 nb_hw_breakpoint = nb_hw_watchpoint = 0;
1641 void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
1645 /* Software Breakpoint updates */
1646 if (kvm_sw_breakpoints_active(cs)) {
1647 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
1650 assert((nb_hw_breakpoint + nb_hw_watchpoint)
1651 <= ARRAY_SIZE(hw_debug_points));
1652 assert((nb_hw_breakpoint + nb_hw_watchpoint) <= ARRAY_SIZE(dbg->arch.bp));
1654 if (nb_hw_breakpoint + nb_hw_watchpoint > 0) {
1655 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
1656 memset(dbg->arch.bp, 0, sizeof(dbg->arch.bp));
1657 for (n = 0; n < nb_hw_breakpoint + nb_hw_watchpoint; n++) {
1658 switch (hw_debug_points[n].type) {
1659 case GDB_BREAKPOINT_HW:
1660 dbg->arch.bp[n].type = KVMPPC_DEBUG_BREAKPOINT;
1662 case GDB_WATCHPOINT_WRITE:
1663 dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_WRITE;
1665 case GDB_WATCHPOINT_READ:
1666 dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_READ;
1668 case GDB_WATCHPOINT_ACCESS:
1669 dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_WRITE |
1670 KVMPPC_DEBUG_WATCH_READ;
1673 cpu_abort(cs, "Unsupported breakpoint type\n");
1675 dbg->arch.bp[n].addr = hw_debug_points[n].addr;
1680 static int kvm_handle_debug(PowerPCCPU *cpu, struct kvm_run *run)
1682 CPUState *cs = CPU(cpu);
1683 CPUPPCState *env = &cpu->env;
1684 struct kvm_debug_exit_arch *arch_info = &run->debug.arch;
1689 if (cs->singlestep_enabled) {
1691 } else if (arch_info->status) {
1692 if (nb_hw_breakpoint + nb_hw_watchpoint > 0) {
1693 if (arch_info->status & KVMPPC_DEBUG_BREAKPOINT) {
1694 n = find_hw_breakpoint(arch_info->address, GDB_BREAKPOINT_HW);
1698 } else if (arch_info->status & (KVMPPC_DEBUG_WATCH_READ |
1699 KVMPPC_DEBUG_WATCH_WRITE)) {
1700 n = find_hw_watchpoint(arch_info->address, &flag);
1703 cs->watchpoint_hit = &hw_watchpoint;
1704 hw_watchpoint.vaddr = hw_debug_points[n].addr;
1705 hw_watchpoint.flags = flag;
1709 } else if (kvm_find_sw_breakpoint(cs, arch_info->address)) {
1712 /* QEMU is not able to handle the debug exception, so inject a
1713 * program exception into the guest;
1714 * yes, a program exception, NOT a debug exception!
1715 * When QEMU is using debug resources then the debug exception must
1716 * always be set. To achieve this we set MSR_DE and also set
1717 * MSRP_DEP so the guest cannot change MSR_DE.
1718 * When emulating debug resources for the guest we want the guest
1719 * to control MSR_DE (enable/disable the debug interrupt on demand).
1720 * Supporting both configurations is NOT possible.
1721 * So the result is that we cannot share debug resources
1722 * between QEMU and the guest on the BookE architecture.
1723 * In the current design QEMU gets priority over the guest;
1724 * this means that if QEMU is using debug resources then the guest cannot.
1726 * For software breakpoints QEMU uses a privileged instruction,
1727 * so there cannot be any reason that we are here because the guest
1728 * set a debug exception; the only possibility is that the guest executed a
1729 * privileged / illegal instruction, and that is why we are
1730 * injecting a program interrupt.
1733 cpu_synchronize_state(cs);
1734 /* env->nip is PC, so increment this by 4 to use
1735 * ppc_cpu_do_interrupt(), which sets srr0 = env->nip - 4.
1738 cs->exception_index = POWERPC_EXCP_PROGRAM;
1739 env->error_code = POWERPC_EXCP_INVAL;
1740 ppc_cpu_do_interrupt(cs);
1746 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
1748 PowerPCCPU *cpu = POWERPC_CPU(cs);
1749 CPUPPCState *env = &cpu->env;
1752 qemu_mutex_lock_iothread();
1754 switch (run->exit_reason) {
1756 if (run->dcr.is_write) {
1757 DPRINTF("handle dcr write\n");
1758 ret = kvmppc_handle_dcr_write(env, run->dcr.dcrn, run->dcr.data);
1760 DPRINTF("handle dcr read\n");
1761 ret = kvmppc_handle_dcr_read(env, run->dcr.dcrn, &run->dcr.data);
1765 DPRINTF("handle halt\n");
1766 ret = kvmppc_handle_halt(cpu);
1768 #if defined(TARGET_PPC64)
1769 case KVM_EXIT_PAPR_HCALL:
1770 DPRINTF("handle PAPR hypercall\n");
1771 run->papr_hcall.ret = spapr_hypercall(cpu,
1773 run->papr_hcall.args);
1778 DPRINTF("handle epr\n");
1779 run->epr.epr = ldl_phys(cs->as, env->mpic_iack);
1782 case KVM_EXIT_WATCHDOG:
1783 DPRINTF("handle watchdog expiry\n");
1784 watchdog_perform_action();
1788 case KVM_EXIT_DEBUG:
1789 DPRINTF("handle debug exception\n");
1790 if (kvm_handle_debug(cpu, run)) {
1794 /* re-enter, this exception was guest-internal */
1799 fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
1804 qemu_mutex_unlock_iothread();
1808 int kvmppc_or_tsr_bits(PowerPCCPU *cpu, uint32_t tsr_bits)
1810 CPUState *cs = CPU(cpu);
1811 uint32_t bits = tsr_bits;
1812 struct kvm_one_reg reg = {
1813 .id = KVM_REG_PPC_OR_TSR,
1814 .addr = (uintptr_t) &bits,
1817 return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®);
1820 int kvmppc_clear_tsr_bits(PowerPCCPU *cpu, uint32_t tsr_bits)
1823 CPUState *cs = CPU(cpu);
1824 uint32_t bits = tsr_bits;
1825 struct kvm_one_reg reg = {
1826 .id = KVM_REG_PPC_CLEAR_TSR,
1827 .addr = (uintptr_t) &bits,
1830 return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®);
1833 int kvmppc_set_tcr(PowerPCCPU *cpu)
1835 CPUState *cs = CPU(cpu);
1836 CPUPPCState *env = &cpu->env;
1837 uint32_t tcr = env->spr[SPR_BOOKE_TCR];
1839 struct kvm_one_reg reg = {
1840 .id = KVM_REG_PPC_TCR,
1841 .addr = (uintptr_t) &tcr,
1844 return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®);
1847 int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu)
1849 CPUState *cs = CPU(cpu);
1852 if (!kvm_enabled()) {
1856 if (!cap_ppc_watchdog) {
1857 printf("warning: KVM does not support watchdog\n");
1861 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_BOOKE_WATCHDOG, 0);
1863 fprintf(stderr, "%s: couldn't enable KVM_CAP_PPC_BOOKE_WATCHDOG: %s\n",
1864 __func__, strerror(-ret));
1871 static int read_cpuinfo(const char *field, char *value, int len)
1875 int field_len = strlen(field);
1878 f = fopen("/proc/cpuinfo", "r");
1884 if (!fgets(line, sizeof(line), f)) {
1887 if (!strncmp(line, field, field_len)) {
1888 pstrcpy(value, len, line);
1899 uint32_t kvmppc_get_tbfreq(void)
1903 uint32_t retval = NANOSECONDS_PER_SECOND;
1905 if (read_cpuinfo("timebase", line, sizeof(line))) {
1909 if (!(ns = strchr(line, ':'))) {
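/*
 * For reference, the /proc/cpuinfo line parsed above typically looks
 * like this on a ppc64 host (the exact value varies by machine):
 *
 *     timebase        : 512000000
 *
 * kvmppc_get_tbfreq() falls back to NANOSECONDS_PER_SECOND when the
 * field cannot be read.
 */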
1918 bool kvmppc_get_host_serial(char **value)
1920 return g_file_get_contents("/proc/device-tree/system-id", value, NULL,
1924 bool kvmppc_get_host_model(char **value)
1926 return g_file_get_contents("/proc/device-tree/model", value, NULL, NULL);
1929 /* Try to find a device tree node for a CPU with clock-frequency property */
1930 static int kvmppc_find_cpu_dt(char *buf, int buf_len)
1932 struct dirent *dirp;
1935 if ((dp = opendir(PROC_DEVTREE_CPU)) == NULL) {
1936 printf("Can't open directory " PROC_DEVTREE_CPU "\n");
1941 while ((dirp = readdir(dp)) != NULL) {
1943 snprintf(buf, buf_len, "%s%s/clock-frequency", PROC_DEVTREE_CPU,
1945 f = fopen(buf, "r");
1947 snprintf(buf, buf_len, "%s%s", PROC_DEVTREE_CPU, dirp->d_name);
1954 if (buf[0] == '\0') {
1955 printf("Unknown host!\n");
1962 static uint64_t kvmppc_read_int_dt(const char *filename)
1971 f = fopen(filename, "rb");
1976 len = fread(&u, 1, sizeof(u), f);
1980 /* property is a 32-bit quantity */
1981 return be32_to_cpu(u.v32);
1983 return be64_to_cpu(u.v64);
1989 /* Read a CPU node property from the host device tree that's a single
1990 * integer (32-bit or 64-bit). Returns 0 if anything goes wrong
1991 * (can't find or open the property, or doesn't understand the format). */
1993 static uint64_t kvmppc_read_int_cpu_dt(const char *propname)
1995 char buf[PATH_MAX], *tmp;
1998 if (kvmppc_find_cpu_dt(buf, sizeof(buf))) {
2002 tmp = g_strdup_printf("%s/%s", buf, propname);
2003 val = kvmppc_read_int_dt(tmp);
2009 uint64_t kvmppc_get_clockfreq(void)
2011 return kvmppc_read_int_cpu_dt("clock-frequency");
2014 uint32_t kvmppc_get_vmx(void)
2016 return kvmppc_read_int_cpu_dt("ibm,vmx");
2019 uint32_t kvmppc_get_dfp(void)
2021 return kvmppc_read_int_cpu_dt("ibm,dfp");
2024 static int kvmppc_get_pvinfo(CPUPPCState *env, struct kvm_ppc_pvinfo *pvinfo)
2026 PowerPCCPU *cpu = ppc_env_get_cpu(env);
2027 CPUState *cs = CPU(cpu);
2029 if (kvm_vm_check_extension(cs->kvm_state, KVM_CAP_PPC_GET_PVINFO) &&
2030 !kvm_vm_ioctl(cs->kvm_state, KVM_PPC_GET_PVINFO, pvinfo)) {
2037 int kvmppc_get_hasidle(CPUPPCState *env)
2039 struct kvm_ppc_pvinfo pvinfo;
2041 if (!kvmppc_get_pvinfo(env, &pvinfo) &&
2042 (pvinfo.flags & KVM_PPC_PVINFO_FLAGS_EV_IDLE)) {
2049 int kvmppc_get_hypercall(CPUPPCState *env, uint8_t *buf, int buf_len)
2051 uint32_t *hc = (uint32_t*)buf;
2052 struct kvm_ppc_pvinfo pvinfo;
2054 if (!kvmppc_get_pvinfo(env, &pvinfo)) {
2055 memcpy(buf, pvinfo.hcall, buf_len);
2060 * Fallback to always fail hypercalls regardless of endianness:
2062 * tdi 0,r0,72 (becomes b .+8 in wrong endian, nop in good endian)
2064 * b .+8 (becomes nop in wrong endian)
2065 * bswap32(li r3, -1)
2068 hc[0] = cpu_to_be32(0x08000048);
2069 hc[1] = cpu_to_be32(0x3860ffff);
2070 hc[2] = cpu_to_be32(0x48000008);
2071 hc[3] = cpu_to_be32(bswap32(0x3860ffff));
2076 static inline int kvmppc_enable_hcall(KVMState *s, target_ulong hcall)
2078 return kvm_vm_enable_cap(s, KVM_CAP_PPC_ENABLE_HCALL, 0, hcall, 1);
2081 void kvmppc_enable_logical_ci_hcalls(void)
2084 * FIXME: it would be nice if we could detect the cases where
2085 * we're using a device which requires the in kernel
2086 * implementation of these hcalls, but the kernel lacks them and
2087 * produce a warning.
2089 kvmppc_enable_hcall(kvm_state, H_LOGICAL_CI_LOAD);
2090 kvmppc_enable_hcall(kvm_state, H_LOGICAL_CI_STORE);
2093 void kvmppc_enable_set_mode_hcall(void)
2095 kvmppc_enable_hcall(kvm_state, H_SET_MODE);
2098 void kvmppc_enable_clear_ref_mod_hcalls(void)
2100 kvmppc_enable_hcall(kvm_state, H_CLEAR_REF);
2101 kvmppc_enable_hcall(kvm_state, H_CLEAR_MOD);
2104 void kvmppc_set_papr(PowerPCCPU *cpu)
2106 CPUState *cs = CPU(cpu);
2109 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_PAPR, 0);
2111 error_report("This vCPU type or KVM version does not support PAPR");
2115 /* Update the capability flag so we sync the right information with KVM */
2120 int kvmppc_set_compat(PowerPCCPU *cpu, uint32_t compat_pvr)
2122 return kvm_set_one_reg(CPU(cpu), KVM_REG_PPC_ARCH_COMPAT, &compat_pvr);
2125 void kvmppc_set_mpic_proxy(PowerPCCPU *cpu, int mpic_proxy)
2127 CPUState *cs = CPU(cpu);
2130 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_EPR, 0, mpic_proxy);
2131 if (ret && mpic_proxy) {
2132 error_report("This KVM version does not support EPR");
2137 int kvmppc_smt_threads(void)
2139 return cap_ppc_smt ? cap_ppc_smt : 1;
2142 int kvmppc_set_smt_threads(int smt)
2146 ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_SMT, 0, smt, 0);
2153 void kvmppc_hint_smt_possible(Error **errp)
2159 assert(kvm_enabled());
2160 if (cap_ppc_smt_possible) {
2161 g = g_string_new("Available VSMT modes:");
2162 for (i = 63; i >= 0; i--) {
2163 if ((1UL << i) & cap_ppc_smt_possible) {
2164 g_string_append_printf(g, " %lu", (1UL << i));
2167 s = g_string_free(g, false);
2168 error_append_hint(errp, "%s.\n", s);
2171 error_append_hint(errp,
2172 "This KVM seems to be too old to support VSMT.\n");
2178 off_t kvmppc_alloc_rma(void **rma)
2182 struct kvm_allocate_rma ret;
2184 /* If cap_ppc_rma == 0, contiguous RMA allocation is not supported;
2185 * if cap_ppc_rma == 1, contiguous RMA allocation is supported, but
2186 * not necessary on this hardware;
2187 * if cap_ppc_rma == 2, contiguous RMA allocation is needed on this hardware.
2189 * FIXME: We should allow the user to force contiguous RMA
2190 * allocation in the cap_ppc_rma == 1 case.
2192 if (cap_ppc_rma < 2) {
2196 fd = kvm_vm_ioctl(kvm_state, KVM_ALLOCATE_RMA, &ret);
2198 fprintf(stderr, "KVM: Error on KVM_ALLOCATE_RMA: %s\n",
2203 size = MIN(ret.rma_size, 256ul << 20);
2205 *rma = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
2206 if (*rma == MAP_FAILED) {
2207 fprintf(stderr, "KVM: Error mapping RMA: %s\n", strerror(errno));
2214 uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift)
2216 struct kvm_ppc_smmu_info info;
2217 long rampagesize, best_page_shift;
2220 if (cap_ppc_rma >= 2) {
2221 return current_size;
2224 /* Find the largest hardware supported page size that's less than
2225 * or equal to the (logical) backing page size of guest RAM */
2226 kvm_get_smmu_info(POWERPC_CPU(first_cpu), &info);
2227 rampagesize = qemu_getrampagesize();
2228 best_page_shift = 0;
2230 for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
2231 struct kvm_ppc_one_seg_page_size *sps = &info.sps[i];
2233 if (!sps->page_shift) {
2237 if ((sps->page_shift > best_page_shift)
2238 && ((1UL << sps->page_shift) <= rampagesize)) {
2239 best_page_shift = sps->page_shift;
2243 return MIN(current_size,
2244 1ULL << (best_page_shift + hash_shift - 7));
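/*
 * Worked example of the clamp above (numbers chosen for illustration):
 * with 64 KiB backing pages (best_page_shift == 16) and a 128 MiB hash
 * table (hash_shift == 27), the limit is 1ULL << (16 + 27 - 7) = 64 GiB,
 * so current_size is returned unchanged unless it exceeds that.
 */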
2248 bool kvmppc_spapr_use_multitce(void)
2250 return cap_spapr_multitce;
2253 int kvmppc_spapr_enable_inkernel_multitce(void)
2257 ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_ENABLE_HCALL, 0,
2258 H_PUT_TCE_INDIRECT, 1);
2260 ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PPC_ENABLE_HCALL, 0,
2267 void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t page_shift,
2268 uint64_t bus_offset, uint32_t nb_table,
2269 int *pfd, bool need_vfio)
2275 /* Must set fd to -1 so we don't try to munmap when called for
2276 * destroying the table, which the upper layers -will- do
2279 if (!cap_spapr_tce || (need_vfio && !cap_spapr_vfio)) {
2283 if (cap_spapr_tce_64) {
2284 struct kvm_create_spapr_tce_64 args = {
2286 .page_shift = page_shift,
2287 .offset = bus_offset >> page_shift,
2291 fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE_64, &args);
2294 "KVM: Failed to create TCE64 table for liobn 0x%x\n",
2298 } else if (cap_spapr_tce) {
2299 uint64_t window_size = (uint64_t) nb_table << page_shift;
2300 struct kvm_create_spapr_tce args = {
2302 .window_size = window_size,
2304 if ((window_size != args.window_size) || bus_offset) {
2307 fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE, &args);
2309 fprintf(stderr, "KVM: Failed to create TCE table for liobn 0x%x\n",
2317 len = nb_table * sizeof(uint64_t);
2318 /* FIXME: round this up to page size */
2320 table = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
2321 if (table == MAP_FAILED) {
2322 fprintf(stderr, "KVM: Failed to map TCE table for liobn 0x%x\n",
2332 int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t nb_table)
2340 len = nb_table * sizeof(uint64_t);
2341 if ((munmap(table, len) < 0) ||
2343 fprintf(stderr, "KVM: Unexpected error removing TCE table: %s",
2345 /* Leak the table */
2351 int kvmppc_reset_htab(int shift_hint)
2353 uint32_t shift = shift_hint;
2355 if (!kvm_enabled()) {
2356 /* Full emulation, tell caller to allocate htab itself */
2359 if (kvm_vm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) {
2361 ret = kvm_vm_ioctl(kvm_state, KVM_PPC_ALLOCATE_HTAB, &shift);
2362 if (ret == -ENOTTY) {
2363 /* At least some versions of PR KVM advertise the
2364 * capability, but don't implement the ioctl(). Oops.
2365 * Return 0 so that we allocate the htab in qemu, as is
2366 * correct for PR. */
2368 } else if (ret < 0) {
2374 /* We have a kernel that predates the htab reset calls. For PR
2375 * KVM, we need to allocate the htab ourselves; an HV KVM of
2376 * this era has already allocated a fixed 16MB hash table. */
2377 if (kvmppc_is_pr(kvm_state)) {
2378 /* PR - tell caller to allocate htab */
2381 /* HV - assume 16MB kernel allocated htab */
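/*
 * Summary of the return convention of kvmppc_reset_htab(), derived from
 * the code above: a positive value is the shift of a kernel-managed HPT,
 * 0 tells the caller that QEMU must allocate the hash table itself, and
 * a negative value is an error.  Illustrative caller sketch (helper
 * names are assumptions):
 *
 *     shift = kvmppc_reset_htab(requested_shift);
 *     if (shift > 0) {
 *         record_kernel_htab_shift(shift);   (kernel owns the HPT)
 *     } else if (shift == 0) {
 *         allocate_htab_in_qemu();
 *     }
 */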
2386 static inline uint32_t mfpvr(void)
2395 static void alter_insns(uint64_t *word, uint64_t flags, bool on)
2404 static void kvmppc_host_cpu_class_init(ObjectClass *oc, void *data)
2406 PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc);
2407 uint32_t vmx = kvmppc_get_vmx();
2408 uint32_t dfp = kvmppc_get_dfp();
2409 uint32_t dcache_size = kvmppc_read_int_cpu_dt("d-cache-size");
2410 uint32_t icache_size = kvmppc_read_int_cpu_dt("i-cache-size");
2412 /* Now fix up the class with information we can query from the host */
2416 /* Only override when we know what the host supports */
2417 alter_insns(&pcc->insns_flags, PPC_ALTIVEC, vmx > 0);
2418 alter_insns(&pcc->insns_flags2, PPC2_VSX, vmx > 1);
2421 /* Only override when we know what the host supports */
2422 alter_insns(&pcc->insns_flags2, PPC2_DFP, dfp);
2425 if (dcache_size != -1) {
2426 pcc->l1_dcache_size = dcache_size;
2429 if (icache_size != -1) {
2430 pcc->l1_icache_size = icache_size;
2433 #if defined(TARGET_PPC64)
2434 pcc->radix_page_info = kvm_get_radix_page_info();
2436 if ((pcc->pvr & 0xffffff00) == CPU_POWERPC_POWER9_DD1) {
2438 * POWER9 DD1 has some bugs which make it not really ISA 3.00
2439 * compliant. More importantly, advertising ISA 3.00
2440 * architected mode may prevent guests from activating
2441 * necessary DD1 workarounds.
2443 pcc->pcr_supported &= ~(PCR_COMPAT_3_00 | PCR_COMPAT_2_07
2444 | PCR_COMPAT_2_06 | PCR_COMPAT_2_05);
2446 #endif /* defined(TARGET_PPC64) */
2449 bool kvmppc_has_cap_epr(void)
2454 bool kvmppc_has_cap_fixup_hcalls(void)
2456 return cap_fixup_hcalls;
2459 bool kvmppc_has_cap_htm(void)
2464 bool kvmppc_has_cap_mmu_radix(void)
2466 return cap_mmu_radix;
2469 bool kvmppc_has_cap_mmu_hash_v3(void)
2471 return cap_mmu_hash_v3;
2474 PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
2476 uint32_t host_pvr = mfpvr();
2477 PowerPCCPUClass *pvr_pcc;
2479 pvr_pcc = ppc_cpu_class_by_pvr(host_pvr);
2480 if (pvr_pcc == NULL) {
2481 pvr_pcc = ppc_cpu_class_by_pvr_mask(host_pvr);
2487 static int kvm_ppc_register_host_cpu_type(MachineState *ms)
2489 TypeInfo type_info = {
2490 .name = TYPE_HOST_POWERPC_CPU,
2491 .class_init = kvmppc_host_cpu_class_init,
2493 MachineClass *mc = MACHINE_GET_CLASS(ms);
2494 PowerPCCPUClass *pvr_pcc;
2499 pvr_pcc = kvm_ppc_get_host_cpu_class();
2500 if (pvr_pcc == NULL) {
2503 type_info.parent = object_class_get_name(OBJECT_CLASS(pvr_pcc));
2504 type_register(&type_info);
2505 if (object_dynamic_cast(OBJECT(ms), TYPE_SPAPR_MACHINE)) {
2506 /* override TCG default cpu type with 'host' cpu model */
2507 mc->default_cpu_type = TYPE_HOST_POWERPC_CPU;
2510 oc = object_class_by_name(type_info.name);
2514 * Update generic CPU family class alias (e.g. on a POWER8NVL host,
2515 * we want "POWER8" to be a "family" alias that points to the current
2516 * host CPU type, too)
2518 dc = DEVICE_CLASS(ppc_cpu_get_family_class(pvr_pcc));
2519 for (i = 0; ppc_cpu_aliases[i].alias != NULL; i++) {
2520 if (strcasecmp(ppc_cpu_aliases[i].alias, dc->desc) == 0) {
2523 ppc_cpu_aliases[i].model = g_strdup(object_class_get_name(oc));
2524 suffix = strstr(ppc_cpu_aliases[i].model, POWERPC_CPU_TYPE_SUFFIX);
2535 int kvmppc_define_rtas_kernel_token(uint32_t token, const char *function)
2537 struct kvm_rtas_token_args args = {
2541 if (!kvm_check_extension(kvm_state, KVM_CAP_PPC_RTAS)) {
2545 strncpy(args.name, function, sizeof(args.name));
2547 return kvm_vm_ioctl(kvm_state, KVM_PPC_RTAS_DEFINE_TOKEN, &args);
2550 int kvmppc_get_htab_fd(bool write, uint64_t index, Error **errp)
2552 struct kvm_get_htab_fd s = {
2553 .flags = write ? KVM_GET_HTAB_WRITE : 0,
2554 .start_index = index,
2559 error_setg(errp, "KVM version doesn't support %s the HPT",
2560 write ? "writing" : "reading");
2564 ret = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &s);
2566 error_setg(errp, "Unable to open fd for %s HPT %s KVM: %s",
2567 write ? "writing" : "reading", write ? "to" : "from",
2575 int kvmppc_save_htab(QEMUFile *f, int fd, size_t bufsize, int64_t max_ns)
2577 int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2578 uint8_t buf[bufsize];
2582 rc = read(fd, buf, bufsize);
2584 fprintf(stderr, "Error reading data from KVM HTAB fd: %s\n",
2588 uint8_t *buffer = buf;
2591 struct kvm_get_htab_header *head =
2592 (struct kvm_get_htab_header *) buffer;
2593 size_t chunksize = sizeof(*head) +
2594 HASH_PTE_SIZE_64 * head->n_valid;
2596 qemu_put_be32(f, head->index);
2597 qemu_put_be16(f, head->n_valid);
2598 qemu_put_be16(f, head->n_invalid);
2599 qemu_put_buffer(f, (void *)(head + 1),
2600 HASH_PTE_SIZE_64 * head->n_valid);
2602 buffer += chunksize;
2608 || ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) < max_ns)));
2610 return (rc == 0) ? 1 : 0;
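/*
 * For reference, each record that kvmppc_save_htab() emits and
 * kvmppc_load_htab_chunk() consumes has the layout below (all fields
 * big-endian on the wire; HASH_PTE_SIZE_64 is 16 bytes per HPTE):
 *
 *     u32 index      first HPTE index covered by this chunk
 *     u16 n_valid    number of valid HPTEs that follow
 *     u16 n_invalid  number of invalid HPTEs to clear after them
 *     u8  hpte[n_valid * HASH_PTE_SIZE_64]
 */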
2613 int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
2614 uint16_t n_valid, uint16_t n_invalid)
2616 struct kvm_get_htab_header *buf;
2617 size_t chunksize = sizeof(*buf) + n_valid*HASH_PTE_SIZE_64;
2620 buf = alloca(chunksize);
2622 buf->n_valid = n_valid;
2623 buf->n_invalid = n_invalid;
2625 qemu_get_buffer(f, (void *)(buf + 1), HASH_PTE_SIZE_64*n_valid);
2627 rc = write(fd, buf, chunksize);
2629 fprintf(stderr, "Error writing KVM hash table: %s\n",
2633 if (rc != chunksize) {
2634 /* We should never get a short write on a single chunk */
2635 fprintf(stderr, "Short write, restoring KVM hash table\n");
2641 bool kvm_arch_stop_on_emulation_error(CPUState *cpu)
2646 void kvm_arch_init_irq_routing(KVMState *s)
2650 void kvmppc_read_hptes(ppc_hash_pte64_t *hptes, hwaddr ptex, int n)
2655 fd = kvmppc_get_htab_fd(false, ptex, &error_abort);
2659 struct kvm_get_htab_header *hdr;
2660 int m = n < HPTES_PER_GROUP ? n : HPTES_PER_GROUP;
2661 char buf[sizeof(*hdr) + m * HASH_PTE_SIZE_64];
2663 rc = read(fd, buf, sizeof(buf));
2665 hw_error("kvmppc_read_hptes: Unable to read HPTEs");
2668 hdr = (struct kvm_get_htab_header *)buf;
2669 while ((i < n) && ((char *)hdr < (buf + rc))) {
2670 int invalid = hdr->n_invalid;
2672 if (hdr->index != (ptex + i)) {
2673 hw_error("kvmppc_read_hptes: Unexpected HPTE index %"PRIu32
2674 " != (%"HWADDR_PRIu" + %d", hdr->index, ptex, i);
2677 memcpy(hptes + i, hdr + 1, HASH_PTE_SIZE_64 * hdr->n_valid);
2680 if ((n - i) < invalid) {
2683 memset(hptes + i, 0, invalid * HASH_PTE_SIZE_64);
2684 i += hdr->n_invalid;
2686 hdr = (struct kvm_get_htab_header *)
2687 ((char *)(hdr + 1) + HASH_PTE_SIZE_64 * hdr->n_valid);
2694 void kvmppc_write_hpte(hwaddr ptex, uint64_t pte0, uint64_t pte1)
2698 struct kvm_get_htab_header hdr;
2703 fd = kvmppc_get_htab_fd(true, 0 /* Ignored */, &error_abort);
2705 buf.hdr.n_valid = 1;
2706 buf.hdr.n_invalid = 0;
2707 buf.hdr.index = ptex;
2708 buf.pte0 = cpu_to_be64(pte0);
2709 buf.pte1 = cpu_to_be64(pte1);
2711 rc = write(fd, &buf, sizeof(buf));
2712 if (rc != sizeof(buf)) {
2713 hw_error("kvmppc_write_hpte: Unable to update KVM HPT");
2718 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
2719 uint64_t address, uint32_t data, PCIDevice *dev)
2724 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
2725 int vector, PCIDevice *dev)
2730 int kvm_arch_release_virq_post(int virq)
2735 int kvm_arch_msi_data_to_gsi(uint32_t data)
2737 return data & 0xffff;
2740 int kvmppc_enable_hwrng(void)
2742 if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) {
2746 return kvmppc_enable_hcall(kvm_state, H_RANDOM);
2749 void kvmppc_check_papr_resize_hpt(Error **errp)
2751 if (!kvm_enabled()) {
2752 return; /* No KVM, we're good */
2755 if (cap_resize_hpt) {
2756 return; /* Kernel has explicit support, we're good */
2759 /* Otherwise fall back to looking for PR KVM */
2760 if (kvmppc_is_pr(kvm_state)) {
2765 "Hash page table resizing not available with this KVM version");
2768 int kvmppc_resize_hpt_prepare(PowerPCCPU *cpu, target_ulong flags, int shift)
2770 CPUState *cs = CPU(cpu);
2771 struct kvm_ppc_resize_hpt rhpt = {
2776 if (!cap_resize_hpt) {
2780 return kvm_vm_ioctl(cs->kvm_state, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
2783 int kvmppc_resize_hpt_commit(PowerPCCPU *cpu, target_ulong flags, int shift)
2785 CPUState *cs = CPU(cpu);
2786 struct kvm_ppc_resize_hpt rhpt = {
2791 if (!cap_resize_hpt) {
2795 return kvm_vm_ioctl(cs->kvm_state, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
2799 * This is a helper function to detect a post migration scenario
2800 * in which a guest, running as KVM-HV, freezes in cpu_post_load because
2801 * the guest kernel can't handle a PVR value other than the actual host
2802 * PVR in KVM_SET_SREGS, even if pvr_match() returns true.
2804 * If we don't have cap_ppc_pvr_compat and we're not running in PR
2805 * (so, we're HV), return true. The workaround itself is done in
2808 * The order here is important: we'll only check for KVM PR as a
2809 * fallback if the guest kernel can't handle the situation itself.
2810 * We need to avoid as much as possible querying the running KVM type
2813 bool kvmppc_pvr_workaround_required(PowerPCCPU *cpu)
2815 CPUState *cs = CPU(cpu);
2817 if (!kvm_enabled()) {
2821 if (cap_ppc_pvr_compat) {
2825 return !kvmppc_is_pr(cs->kvm_state);