spapr: Limit threads per core according to current compatibility mode

[qemu.git] / target-i386 / kvm.c
diff --git a/target-i386/kvm.c b/target-i386/kvm.c

index 7522e980721ac1056547a2b7c02af31af4afddfb..4bf0ac9e76a082e43a1c58c86c5431ee01e16d93 100644 (file)
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -30,6 +30,8 @@
  #include "qemu/config-file.h"
  #include "hw/i386/pc.h"
  #include "hw/i386/apic.h"
+#include "hw/i386/apic_internal.h"
+#include "hw/i386/apic-msidef.h"
  #include "exec/ioport.h"
  #include <asm/hyperv.h>
  #include "hw/pci/pci.h"
@@ -69,8 +71,12 @@ static bool has_msr_feature_control;
  static bool has_msr_async_pf_en;
  static bool has_msr_pv_eoi_en;
  static bool has_msr_misc_enable;
+static bool has_msr_bndcfgs;
  static bool has_msr_kvm_steal_time;
  static int lm_capable_kernel;
+static bool has_msr_hv_hypercall;
+static bool has_msr_hv_vapic;
+static bool has_msr_hv_tsc;
  
  static bool has_msr_architectural_pmu;
  static uint32_t num_architectural_pmu_counters;
@@ -118,7 +124,7 @@ static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
      return cpuid;
  }
  
-struct kvm_para_features {
+static const struct kvm_para_features {
      int cap;
      int feature;
  } para_features[] = {
@@ -126,14 +132,13 @@ struct kvm_para_features {
      { KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
      { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
      { KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
-    { -1, -1 }
  };
  
  static int get_para_features(KVMState *s)
  {
      int i, features = 0;
  
-    for (i = 0; i < ARRAY_SIZE(para_features) - 1; i++) {
+    for (i = 0; i < ARRAY_SIZE(para_features); i++) {
          if (kvm_check_extension(s, para_features[i].cap)) {
              features |= (1 << para_features[i].feature);
          }
@@ -436,8 +441,11 @@ static bool hyperv_hypercall_available(X86CPU *cpu)
  
  static bool hyperv_enabled(X86CPU *cpu)
  {
-    return hyperv_hypercall_available(cpu) ||
-           cpu->hyperv_relaxed_timing;
+    CPUState *cs = CPU(cpu);
+    return kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0 &&
+           (hyperv_hypercall_available(cpu) ||
+            cpu->hyperv_time  ||
+            cpu->hyperv_relaxed_timing);
  }
  
  #define KVM_MAX_CPUID_ENTRIES  100
@@ -454,6 +462,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
      uint32_t unused;
      struct kvm_cpuid_entry2 *c;
      uint32_t signature[3];
+    int kvm_base = KVM_CPUID_SIGNATURE;
      int r;
  
      memset(&cpuid_data, 0, sizeof(cpuid_data));
@@ -461,26 +470,22 @@ int kvm_arch_init_vcpu(CPUState *cs)
      cpuid_i = 0;
  
      /* Paravirtualization CPUIDs */
-    c = &cpuid_data.entries[cpuid_i++];
-    c->function = KVM_CPUID_SIGNATURE;
-    if (!hyperv_enabled(cpu)) {
-        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
-        c->eax = 0;
-    } else {
+    if (hyperv_enabled(cpu)) {
+        c = &cpuid_data.entries[cpuid_i++];
+        c->function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
          memcpy(signature, "Microsoft Hv", 12);
          c->eax = HYPERV_CPUID_MIN;
-    }
-    c->ebx = signature[0];
-    c->ecx = signature[1];
-    c->edx = signature[2];
-
-    c = &cpuid_data.entries[cpuid_i++];
-    c->function = KVM_CPUID_FEATURES;
-    c->eax = env->features[FEAT_KVM];
+        c->ebx = signature[0];
+        c->ecx = signature[1];
+        c->edx = signature[2];
  
-    if (hyperv_enabled(cpu)) {
+        c = &cpuid_data.entries[cpuid_i++];
+        c->function = HYPERV_CPUID_INTERFACE;
          memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
          c->eax = signature[0];
+        c->ebx = 0;
+        c->ecx = 0;
+        c->edx = 0;
  
          c = &cpuid_data.entries[cpuid_i++];
          c->function = HYPERV_CPUID_VERSION;
@@ -495,14 +500,21 @@ int kvm_arch_init_vcpu(CPUState *cs)
          if (cpu->hyperv_vapic) {
              c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
              c->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE;
+            has_msr_hv_vapic = true;
+        }
+        if (cpu->hyperv_time &&
+            kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
+            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
+            c->eax |= HV_X64_MSR_TIME_REF_COUNT_AVAILABLE;
+            c->eax |= 0x200;
+            has_msr_hv_tsc = true;
          }
-
          c = &cpuid_data.entries[cpuid_i++];
          c->function = HYPERV_CPUID_ENLIGHTMENT_INFO;
          if (cpu->hyperv_relaxed_timing) {
              c->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
          }
-        if (cpu->hyperv_vapic) {
+        if (has_msr_hv_vapic) {
              c->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
          }
          c->ebx = cpu->hyperv_spinlock_attempts;
@@ -512,20 +524,29 @@ int kvm_arch_init_vcpu(CPUState *cs)
          c->eax = 0x40;
          c->ebx = 0x40;
  
-        c = &cpuid_data.entries[cpuid_i++];
-        c->function = KVM_CPUID_SIGNATURE_NEXT;
+        kvm_base = KVM_CPUID_SIGNATURE_NEXT;
+        has_msr_hv_hypercall = true;
+    }
+
+    if (cpu->expose_kvm) {
          memcpy(signature, "KVMKVMKVM\0\0\0", 12);
-        c->eax = 0;
+        c = &cpuid_data.entries[cpuid_i++];
+        c->function = KVM_CPUID_SIGNATURE | kvm_base;
+        c->eax = KVM_CPUID_FEATURES | kvm_base;
          c->ebx = signature[0];
          c->ecx = signature[1];
          c->edx = signature[2];
-    }
  
-    has_msr_async_pf_en = c->eax & (1 << KVM_FEATURE_ASYNC_PF);
+        c = &cpuid_data.entries[cpuid_i++];
+        c->function = KVM_CPUID_FEATURES | kvm_base;
+        c->eax = env->features[FEAT_KVM];
  
-    has_msr_pv_eoi_en = c->eax & (1 << KVM_FEATURE_PV_EOI);
+        has_msr_async_pf_en = c->eax & (1 << KVM_FEATURE_ASYNC_PF);
  
-    has_msr_kvm_steal_time = c->eax & (1 << KVM_FEATURE_STEAL_TIME);
+        has_msr_pv_eoi_en = c->eax & (1 << KVM_FEATURE_PV_EOI);
+
+        has_msr_kvm_steal_time = c->eax & (1 << KVM_FEATURE_STEAL_TIME);
+    }
  
      cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
  
@@ -706,9 +727,8 @@ int kvm_arch_init_vcpu(CPUState *cs)
      return 0;
  }
  
-void kvm_arch_reset_vcpu(CPUState *cs)
+void kvm_arch_reset_vcpu(X86CPU *cpu)
  {
-    X86CPU *cpu = X86_CPU(cs);
      CPUX86State *env = &cpu->env;
  
      env->exception_injected = -1;
@@ -722,6 +742,16 @@ void kvm_arch_reset_vcpu(CPUState *cs)
      }
  }
  
+void kvm_arch_do_init_vcpu(X86CPU *cpu)
+{
+    CPUX86State *env = &cpu->env;
+
+    /* APs go straight to wait-for-SIPI; any other mp_state is kept.  */
+    if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
+        env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
+    }
+}
+
  static int kvm_get_supported_msrs(KVMState *s)
  {
      static int kvm_supported_msrs;
@@ -772,6 +802,10 @@ static int kvm_get_supported_msrs(KVMState *s)
                      has_msr_misc_enable = true;
                      continue;
                  }
+                if (kvm_msr_list->indices[i] == MSR_IA32_BNDCFGS) {
+                    has_msr_bndcfgs = true;
+                    continue;
+                }
              }
          }
  
@@ -975,6 +1009,8 @@ static int kvm_put_fpu(X86CPU *cpu)
  #define XSAVE_XMM_SPACE   40
  #define XSAVE_XSTATE_BV   128
  #define XSAVE_YMMH_SPACE  144
+#define XSAVE_BNDREGS     240
+#define XSAVE_BNDCSR      256
  
  static int kvm_put_xsave(X86CPU *cpu)
  {
@@ -1007,6 +1043,10 @@ static int kvm_put_xsave(X86CPU *cpu)
      *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV] = env->xstate_bv;
      memcpy(&xsave->region[XSAVE_YMMH_SPACE], env->ymmh_regs,
              sizeof env->ymmh_regs);
+    memcpy(&xsave->region[XSAVE_BNDREGS], env->bnd_regs,
+            sizeof env->bnd_regs);
+    memcpy(&xsave->region[XSAVE_BNDCSR], &env->bndcs_regs,
+            sizeof(env->bndcs_regs));
      r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
      return r;
  }
@@ -1104,6 +1144,25 @@ static int kvm_put_tscdeadline_msr(X86CPU *cpu)
      return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
  }
  
+/*
+ * Write only MSR_IA32_FEATURE_CONTROL, as a single-entry KVM_SET_MSRS call,
+ * to kick the VCPU out of VMXON or even guest mode on reset. This must happen
+ * before any other state is written, because forcibly leaving nested mode
+ * invalidates the VCPU state. Returns the KVM_SET_MSRS ioctl result.
+ */
+static int kvm_put_msr_feature_control(X86CPU *cpu)
+{
+    struct {
+        struct kvm_msrs info;           /* header; nmsrs is set below */
+        struct kvm_msr_entry entry;     /* the single MSR entry */
+    } msr_data;
+
+    kvm_msr_entry_set(&msr_data.entry, MSR_IA32_FEATURE_CONTROL,
+                      cpu->env.msr_ia32_feature_control);
+    msr_data.info.nmsrs = 1;            /* exactly one entry is passed */
+    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
+}
+
  static int kvm_put_msrs(X86CPU *cpu, int level)
  {
      CPUX86State *env = &cpu->env;
@@ -1131,6 +1190,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
          kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE,
                            env->msr_ia32_misc_enable);
      }
+    if (has_msr_bndcfgs) {
+        kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
+    }
  #ifdef TARGET_X86_64
      if (lm_capable_kernel) {
          kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
@@ -1139,22 +1201,12 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
          kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
      }
  #endif
-    if (level == KVM_PUT_FULL_STATE) {
-        /*
-         * KVM is yet unable to synchronize TSC values of multiple VCPUs on
-         * writeback. Until this is fixed, we only write the offset to SMP
-         * guests after migration, desynchronizing the VCPUs, but avoiding
-         * huge jump-backs that would occur without any writeback at all.
-         */
-        if (smp_cpus == 1 || env->tsc != 0) {
-            kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
-        }
-    }
      /*
       * The following MSRs have side effects on the guest or are too heavy
       * for normal writeback. Limit them to reset or full state updates.
       */
      if (level >= KVM_PUT_RESET_STATE) {
+        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
          kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
                            env->system_time_msr);
          kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
@@ -1197,17 +1249,23 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
              kvm_msr_entry_set(&msrs[n++], MSR_CORE_PERF_GLOBAL_CTRL,
                                env->msr_global_ctrl);
          }
-        if (hyperv_hypercall_available(cpu)) {
-            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_GUEST_OS_ID, 0);
-            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_HYPERCALL, 0);
+        if (has_msr_hv_hypercall) {
+            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_GUEST_OS_ID,
+                              env->msr_hv_guest_os_id);
+            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_HYPERCALL,
+                              env->msr_hv_hypercall);
          }
-        if (cpu->hyperv_vapic) {
-            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE, 0);
+        if (has_msr_hv_vapic) {
+            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE,
+                              env->msr_hv_vapic);
          }
-        if (has_msr_feature_control) {
-            kvm_msr_entry_set(&msrs[n++], MSR_IA32_FEATURE_CONTROL,
-                              env->msr_ia32_feature_control);
+        if (has_msr_hv_tsc) {
+            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_REFERENCE_TSC,
+                              env->msr_hv_tsc);
          }
+
+        /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
+         *       kvm_put_msr_feature_control. */
      }
      if (env->mcg_cap) {
          int i;
@@ -1289,6 +1347,10 @@ static int kvm_get_xsave(X86CPU *cpu)
      env->xstate_bv = *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV];
      memcpy(env->ymmh_regs, &xsave->region[XSAVE_YMMH_SPACE],
              sizeof env->ymmh_regs);
+    memcpy(env->bnd_regs, &xsave->region[XSAVE_BNDREGS],
+            sizeof env->bnd_regs);
+    memcpy(&env->bndcs_regs, &xsave->region[XSAVE_BNDCSR],
+            sizeof(env->bndcs_regs));
      return 0;
  }
  
@@ -1370,7 +1432,7 @@ static int kvm_get_sregs(X86CPU *cpu)
         HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
         HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)
  
-    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
+    hflags = (env->segs[R_SS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
      hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
      hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
                  (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
@@ -1435,6 +1497,9 @@ static int kvm_get_msrs(X86CPU *cpu)
      if (has_msr_feature_control) {
          msrs[n++].index = MSR_IA32_FEATURE_CONTROL;
      }
+    if (has_msr_bndcfgs) {
+        msrs[n++].index = MSR_IA32_BNDCFGS;
+    }
  
      if (!env->tsc_valid) {
          msrs[n++].index = MSR_IA32_TSC;
@@ -1482,6 +1547,17 @@ static int kvm_get_msrs(X86CPU *cpu)
          }
      }
  
+    if (has_msr_hv_hypercall) {
+        msrs[n++].index = HV_X64_MSR_HYPERCALL;
+        msrs[n++].index = HV_X64_MSR_GUEST_OS_ID;
+    }
+    if (has_msr_hv_vapic) {
+        msrs[n++].index = HV_X64_MSR_APIC_ASSIST_PAGE;
+    }
+    if (has_msr_hv_tsc) {
+        msrs[n++].index = HV_X64_MSR_REFERENCE_TSC;
+    }
+
      msr_data.info.nmsrs = n;
      ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
      if (ret < 0) {
@@ -1550,6 +1626,9 @@ static int kvm_get_msrs(X86CPU *cpu)
          case MSR_IA32_FEATURE_CONTROL:
              env->msr_ia32_feature_control = msrs[i].data;
              break;
+        case MSR_IA32_BNDCFGS:
+            env->msr_bndcfgs = msrs[i].data;
+            break;
          default:
              if (msrs[i].index >= MSR_MC0_CTL &&
                  msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
@@ -1586,6 +1665,18 @@ static int kvm_get_msrs(X86CPU *cpu)
          case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
              env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
              break;
+        case HV_X64_MSR_HYPERCALL:
+            env->msr_hv_hypercall = msrs[i].data;
+            break;
+        case HV_X64_MSR_GUEST_OS_ID:
+            env->msr_hv_guest_os_id = msrs[i].data;
+            break;
+        case HV_X64_MSR_APIC_ASSIST_PAGE:
+            env->msr_hv_vapic = msrs[i].data;
+            break;
+        case HV_X64_MSR_REFERENCE_TSC:
+            env->msr_hv_tsc = msrs[i].data;
+            break;
          }
      }
  
@@ -1799,6 +1890,13 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
  
      assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
  
+    if (level >= KVM_PUT_RESET_STATE && has_msr_feature_control) {
+        ret = kvm_put_msr_feature_control(x86_cpu);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
      ret = kvm_getput_regs(x86_cpu, 1);
      if (ret < 0) {
          return ret;
@@ -1919,14 +2017,15 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
          }
      }
  
-    if (!kvm_irqchip_in_kernel()) {
-        /* Force the VCPU out of its inner loop to process any INIT requests
-         * or pending TPR access reports. */
-        if (cpu->interrupt_request &
-            (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
-            cpu->exit_request = 1;
-        }
+    /* Force the VCPU out of its inner loop to process any INIT requests
+     * or (for userspace APIC, but it is cheap to combine the checks here)
+     * pending TPR access reports.
+     */
+    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
+        cpu->exit_request = 1;
+    }
  
+    if (!kvm_irqchip_in_kernel()) {
          /* Try to inject an interrupt if the guest can accept it */
          if (run->ready_for_interrupt_injection &&
              (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
@@ -2006,6 +2105,11 @@ int kvm_arch_process_async_events(CPUState *cs)
          }
      }
  
+    if (cs->interrupt_request & CPU_INTERRUPT_INIT) {
+        kvm_cpu_synchronize_state(cs);
+        do_cpu_init(cpu);
+    }
+
      if (kvm_irqchip_in_kernel()) {
          return 0;
      }
@@ -2019,10 +2123,6 @@ int kvm_arch_process_async_events(CPUState *cs)
          (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
          cs->halted = 0;
      }
-    if (cs->interrupt_request & CPU_INTERRUPT_INIT) {
-        kvm_cpu_synchronize_state(cs);
-        do_cpu_init(cpu);
-    }
      if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
          kvm_cpu_synchronize_state(cs);
          do_cpu_sipi(cpu);
@@ -2191,13 +2291,13 @@ static int kvm_handle_debug(X86CPU *cpu,
                          break;
                      case 0x1:
                          ret = EXCP_DEBUG;
-                        env->watchpoint_hit = &hw_watchpoint;
+                        cs->watchpoint_hit = &hw_watchpoint;
                          hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                          hw_watchpoint.flags = BP_MEM_WRITE;
                          break;
                      case 0x3:
                          ret = EXCP_DEBUG;
-                        env->watchpoint_hit = &hw_watchpoint;
+                        cs->watchpoint_hit = &hw_watchpoint;
                          hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                          hw_watchpoint.flags = BP_MEM_ACCESS;
                          break;
@@ -2205,11 +2305,11 @@ static int kvm_handle_debug(X86CPU *cpu,
                  }
              }
          }
-    } else if (kvm_find_sw_breakpoint(CPU(cpu), arch_info->pc)) {
+    } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
          ret = EXCP_DEBUG;
      }
      if (ret == 0) {
-        cpu_synchronize_state(CPU(cpu));
+        cpu_synchronize_state(cs);
          assert(env->exception_injected == -1);
  
          /* pass to guest */