arch/x86/kvm/vmx/vmx.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * This module enables machines with Intel VT-x extensions to run virtual
   6  * machines without emulation or binary translation.
   7  *
   8  * Copyright (C) 2006 Qumranet, Inc.
   9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10  *
  11  * Authors:
  12  *   Avi Kivity   <[email protected]>
  13  *   Yaniv Kamay  <[email protected]>
  14  */
  15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  16
  17 #include <linux/highmem.h>
  18 #include <linux/hrtimer.h>
  19 #include <linux/kernel.h>
  20 #include <linux/kvm_host.h>
  21 #include <linux/module.h>
  22 #include <linux/moduleparam.h>
  23 #include <linux/mod_devicetable.h>
  24 #include <linux/mm.h>
  25 #include <linux/objtool.h>
  26 #include <linux/sched.h>
  27 #include <linux/sched/smt.h>
  28 #include <linux/slab.h>
  29 #include <linux/tboot.h>
  30 #include <linux/trace_events.h>
  31 #include <linux/entry-kvm.h>
  32
  33 #include <asm/apic.h>
  34 #include <asm/asm.h>
  35 #include <asm/cpu.h>
  36 #include <asm/cpu_device_id.h>
  37 #include <asm/debugreg.h>
  38 #include <asm/desc.h>
  39 #include <asm/fpu/api.h>
  40 #include <asm/fpu/xstate.h>
  41 #include <asm/fred.h>
  42 #include <asm/idtentry.h>
  43 #include <asm/io.h>
  44 #include <asm/irq_remapping.h>
  45 #include <asm/reboot.h>
  46 #include <asm/perf_event.h>
  47 #include <asm/mmu_context.h>
  48 #include <asm/mshyperv.h>
  49 #include <asm/mwait.h>
  50 #include <asm/spec-ctrl.h>
  51 #include <asm/vmx.h>
  52
  53 #include <trace/events/ipi.h>
  54
  55 #include "capabilities.h"
  56 #include "cpuid.h"
  57 #include "hyperv.h"
  58 #include "kvm_onhyperv.h"
  59 #include "irq.h"
  60 #include "kvm_cache_regs.h"
  61 #include "lapic.h"
  62 #include "mmu.h"
  63 #include "nested.h"
  64 #include "pmu.h"
  65 #include "sgx.h"
  66 #include "trace.h"
  67 #include "vmcs.h"
  68 #include "vmcs12.h"
  69 #include "vmx.h"
  70 #include "x86.h"
  71 #include "x86_ops.h"
  72 #include "smm.h"
  73 #include "vmx_onhyperv.h"
  74 #include "posted_intr.h"
  75
  76 MODULE_AUTHOR("Qumranet");
  77 MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
  78 MODULE_LICENSE("GPL");
  79
  80 #ifdef MODULE
  81 static const struct x86_cpu_id vmx_cpu_id[] = {
  82         X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
  83         {}
  84 };
  85 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
  86 #endif
  87
  88 bool __read_mostly enable_vpid = 1;
  89 module_param_named(vpid, enable_vpid, bool, 0444);
  90
  91 static bool __read_mostly enable_vnmi = 1;
  92 module_param_named(vnmi, enable_vnmi, bool, 0444);
  93
  94 bool __read_mostly flexpriority_enabled = 1;
  95 module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
  96
  97 bool __read_mostly enable_ept = 1;
  98 module_param_named(ept, enable_ept, bool, 0444);
  99
 100 bool __read_mostly enable_unrestricted_guest = 1;
 101 module_param_named(unrestricted_guest,
 102                         enable_unrestricted_guest, bool, 0444);
 103
 104 bool __read_mostly enable_ept_ad_bits = 1;
 105 module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
 106
 107 static bool __read_mostly emulate_invalid_guest_state = true;
 108 module_param(emulate_invalid_guest_state, bool, 0444);
 109
 110 static bool __read_mostly fasteoi = 1;
 111 module_param(fasteoi, bool, 0444);
 112
 113 module_param(enable_apicv, bool, 0444);
 114
 115 bool __read_mostly enable_ipiv = true;
 116 module_param(enable_ipiv, bool, 0444);
 117
 118 /*
 119  * If nested=1, nested virtualization is supported, i.e., guests may use
 120  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 121  * use VMX instructions.
 122  */
 123 static bool __read_mostly nested = 1;
 124 module_param(nested, bool, 0444);
 125
 126 bool __read_mostly enable_pml = 1;
 127 module_param_named(pml, enable_pml, bool, 0444);
 128
 129 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
 130 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
 131
 132 static bool __read_mostly dump_invalid_vmcs = 0;
 133 module_param(dump_invalid_vmcs, bool, 0644);
 134
 135 #define MSR_BITMAP_MODE_X2APIC          1
 136 #define MSR_BITMAP_MODE_X2APIC_APICV    2
 137
 138 #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
 139
 140 /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
 141 static int __read_mostly cpu_preemption_timer_multi;
 142 static bool __read_mostly enable_preemption_timer = 1;
 143 #ifdef CONFIG_X86_64
 144 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 145 #endif
 146
 147 extern bool __read_mostly allow_smaller_maxphyaddr;
 148 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
 149
 150 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
 151 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 152 #define KVM_VM_CR0_ALWAYS_ON                            \
 153         (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 154
 155 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
 156 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
 157 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
 158
 159 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
 160
 161 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
 162         RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
 163         RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
 164         RTIT_STATUS_BYTECNT))
 165
 166 /*
 167  * List of MSRs that can be directly passed to the guest.
 168  * In addition to these x2apic, PT and LBR MSRs are handled specially.
 169  */
 170 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
 171         MSR_IA32_SPEC_CTRL,
 172         MSR_IA32_PRED_CMD,
 173         MSR_IA32_FLUSH_CMD,
 174         MSR_IA32_TSC,
 175 #ifdef CONFIG_X86_64
 176         MSR_FS_BASE,
 177         MSR_GS_BASE,
 178         MSR_KERNEL_GS_BASE,
 179         MSR_IA32_XFD,
 180         MSR_IA32_XFD_ERR,
 181 #endif
 182         MSR_IA32_SYSENTER_CS,
 183         MSR_IA32_SYSENTER_ESP,
 184         MSR_IA32_SYSENTER_EIP,
 185         MSR_CORE_C1_RES,
 186         MSR_CORE_C3_RESIDENCY,
 187         MSR_CORE_C6_RESIDENCY,
 188         MSR_CORE_C7_RESIDENCY,
 189 };
 190
 191 /*
 192  * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 193  * ple_gap:    upper bound on the amount of time between two successive
 194  *             executions of PAUSE in a loop. Also indicate if ple enabled.
 195  *             According to test, this time is usually smaller than 128 cycles.
 196  * ple_window: upper bound on the amount of time a guest is allowed to execute
 197  *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 198  *             less than 2^12 cycles
 199  * Time is measured based on a counter that runs at the same rate as the TSC,
 200  * refer SDM volume 3b section 21.6.13 & 22.1.3.
 201  */
 202 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
 203 module_param(ple_gap, uint, 0444);
 204
 205 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 206 module_param(ple_window, uint, 0444);
 207
 208 /* Default doubles per-vcpu window every exit. */
 209 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 210 module_param(ple_window_grow, uint, 0444);
 211
 212 /* Default resets per-vcpu window every exit to ple_window. */
 213 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 214 module_param(ple_window_shrink, uint, 0444);
 215
 216 /* Default is to compute the maximum so we can never overflow. */
 217 static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 218 module_param(ple_window_max, uint, 0444);
 219
 220 /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
 221 int __read_mostly pt_mode = PT_MODE_SYSTEM;
 222 #ifdef CONFIG_BROKEN
 223 module_param(pt_mode, int, S_IRUGO);
 224 #endif
 225
 226 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
 227
 228 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 229 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
 230 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
 231
 232 /* Storage for pre module init parameter parsing */
 233 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
 234
 235 static const struct {
 236         const char *option;
 237         bool for_parse;
 238 } vmentry_l1d_param[] = {
 239         [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
 240         [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
 241         [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
 242         [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
 243         [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
 244         [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
 245 };
 246
 247 #define L1D_CACHE_ORDER 4
 248 static void *vmx_l1d_flush_pages;
 249
 250 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 251 {
 252         struct page *page;
 253         unsigned int i;
 254
 255         if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
 256                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 257                 return 0;
 258         }
 259
 260         if (!enable_ept) {
 261                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
 262                 return 0;
 263         }
 264
 265         if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
 266                 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
 267                 return 0;
 268         }
 269
 270         /* If set to auto use the default l1tf mitigation method */
 271         if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
 272                 switch (l1tf_mitigation) {
 273                 case L1TF_MITIGATION_OFF:
 274                         l1tf = VMENTER_L1D_FLUSH_NEVER;
 275                         break;
 276                 case L1TF_MITIGATION_FLUSH_NOWARN:
 277                 case L1TF_MITIGATION_FLUSH:
 278                 case L1TF_MITIGATION_FLUSH_NOSMT:
 279                         l1tf = VMENTER_L1D_FLUSH_COND;
 280                         break;
 281                 case L1TF_MITIGATION_FULL:
 282                 case L1TF_MITIGATION_FULL_FORCE:
 283                         l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 284                         break;
 285                 }
 286         } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
 287                 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
 288         }
 289
 290         if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
 291             !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
 292                 /*
 293                  * This allocation for vmx_l1d_flush_pages is not tied to a VM
 294                  * lifetime and so should not be charged to a memcg.
 295                  */
 296                 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
 297                 if (!page)
 298                         return -ENOMEM;
 299                 vmx_l1d_flush_pages = page_address(page);
 300
 301                 /*
 302                  * Initialize each page with a different pattern in
 303                  * order to protect against KSM in the nested
 304                  * virtualization case.
 305                  */
 306                 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
 307                         memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
 308                                PAGE_SIZE);
 309                 }
 310         }
 311
 312         l1tf_vmx_mitigation = l1tf;
 313
 314         if (l1tf != VMENTER_L1D_FLUSH_NEVER)
 315                 static_branch_enable(&vmx_l1d_should_flush);
 316         else
 317                 static_branch_disable(&vmx_l1d_should_flush);
 318
 319         if (l1tf == VMENTER_L1D_FLUSH_COND)
 320                 static_branch_enable(&vmx_l1d_flush_cond);
 321         else
 322                 static_branch_disable(&vmx_l1d_flush_cond);
 323         return 0;
 324 }
 325
 326 static int vmentry_l1d_flush_parse(const char *s)
 327 {
 328         unsigned int i;
 329
 330         if (s) {
 331                 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
 332                         if (vmentry_l1d_param[i].for_parse &&
 333                             sysfs_streq(s, vmentry_l1d_param[i].option))
 334                                 return i;
 335                 }
 336         }
 337         return -EINVAL;
 338 }
 339
 340 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
 341 {
 342         int l1tf, ret;
 343
 344         l1tf = vmentry_l1d_flush_parse(s);
 345         if (l1tf < 0)
 346                 return l1tf;
 347
 348         if (!boot_cpu_has(X86_BUG_L1TF))
 349                 return 0;
 350
 351         /*
 352          * Has vmx_init() run already? If not then this is the pre init
 353          * parameter parsing. In that case just store the value and let
 354          * vmx_init() do the proper setup after enable_ept has been
 355          * established.
 356          */
 357         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
 358                 vmentry_l1d_flush_param = l1tf;
 359                 return 0;
 360         }
 361
 362         mutex_lock(&vmx_l1d_flush_mutex);
 363         ret = vmx_setup_l1d_flush(l1tf);
 364         mutex_unlock(&vmx_l1d_flush_mutex);
 365         return ret;
 366 }
 367
 368 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
 369 {
 370         if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
 371                 return sysfs_emit(s, "???\n");
 372
 373         return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
 374 }
 375
 376 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
 377 {
 378         u64 msr;
 379
 380         if (!vmx->disable_fb_clear)
 381                 return;
 382
 383         msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
 384         msr |= FB_CLEAR_DIS;
 385         native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
 386         /* Cache the MSR value to avoid reading it later */
 387         vmx->msr_ia32_mcu_opt_ctrl = msr;
 388 }
 389
 390 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
 391 {
 392         if (!vmx->disable_fb_clear)
 393                 return;
 394
 395         vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
 396         native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
 397 }
 398
 399 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
 400 {
 401         /*
 402          * Disable VERW's behavior of clearing CPU buffers for the guest if the
 403          * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
 404          * the mitigation. Disabling the clearing behavior provides a
 405          * performance boost for guests that aren't aware that manually clearing
 406          * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
 407          * and VM-Exit.
 408          */
 409         vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
 410                                 (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
 411                                 !boot_cpu_has_bug(X86_BUG_MDS) &&
 412                                 !boot_cpu_has_bug(X86_BUG_TAA);
 413
 414         /*
 415          * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
 416          * at VMEntry. Skip the MSR read/write when a guest has no use case to
 417          * execute VERW.
 418          */
 419         if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
 420            ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
 421             (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
 422             (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
 423             (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
 424             (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
 425                 vmx->disable_fb_clear = false;
 426 }
 427
 428 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 429         .set = vmentry_l1d_flush_set,
 430         .get = vmentry_l1d_flush_get,
 431 };
 432 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 433
 434 static u32 vmx_segment_access_rights(struct kvm_segment *var);
 435
 436 void vmx_vmexit(void);
 437
 438 #define vmx_insn_failed(fmt...)         \
 439 do {                                    \
 440         WARN_ONCE(1, fmt);              \
 441         pr_warn_ratelimited(fmt);       \
 442 } while (0)
 443
 444 noinline void vmread_error(unsigned long field)
 445 {
 446         vmx_insn_failed("vmread failed: field=%lx\n", field);
 447 }
 448
 449 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
 450 noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
 451 {
 452         if (fault) {
 453                 kvm_spurious_fault();
 454         } else {
 455                 instrumentation_begin();
 456                 vmread_error(field);
 457                 instrumentation_end();
 458         }
 459 }
 460 #endif
 461
 462 noinline void vmwrite_error(unsigned long field, unsigned long value)
 463 {
 464         vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
 465                         field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 466 }
 467
 468 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
 469 {
 470         vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
 471                         vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
 472 }
 473
 474 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
 475 {
 476         vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
 477                         vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
 478 }
 479
 480 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
 481 {
 482         vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
 483                         ext, vpid, gva);
 484 }
 485
 486 noinline void invept_error(unsigned long ext, u64 eptp)
 487 {
 488         vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
 489 }
 490
 491 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 492 DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 493 /*
 494  * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 495  * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 496  */
 497 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 498
 499 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 500 static DEFINE_SPINLOCK(vmx_vpid_lock);
 501
 502 struct vmcs_config vmcs_config __ro_after_init;
 503 struct vmx_capability vmx_capability __ro_after_init;
 504
 505 #define VMX_SEGMENT_FIELD(seg)                                  \
 506         [VCPU_SREG_##seg] = {                                   \
 507                 .selector = GUEST_##seg##_SELECTOR,             \
 508                 .base = GUEST_##seg##_BASE,                     \
 509                 .limit = GUEST_##seg##_LIMIT,                   \
 510                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
 511         }
 512
 513 static const struct kvm_vmx_segment_field {
 514         unsigned selector;
 515         unsigned base;
 516         unsigned limit;
 517         unsigned ar_bytes;
 518 } kvm_vmx_segment_fields[] = {
 519         VMX_SEGMENT_FIELD(CS),
 520         VMX_SEGMENT_FIELD(DS),
 521         VMX_SEGMENT_FIELD(ES),
 522         VMX_SEGMENT_FIELD(FS),
 523         VMX_SEGMENT_FIELD(GS),
 524         VMX_SEGMENT_FIELD(SS),
 525         VMX_SEGMENT_FIELD(TR),
 526         VMX_SEGMENT_FIELD(LDTR),
 527 };
 528
 529
 530 static unsigned long host_idt_base;
 531
 532 #if IS_ENABLED(CONFIG_HYPERV)
 533 static bool __read_mostly enlightened_vmcs = true;
 534 module_param(enlightened_vmcs, bool, 0444);
 535
 536 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
 537 {
 538         struct hv_enlightened_vmcs *evmcs;
 539         hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
 540
 541         if (partition_assist_page == INVALID_PAGE)
 542                 return -ENOMEM;
 543
 544         evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
 545
 546         evmcs->partition_assist_page = partition_assist_page;
 547         evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
 548         evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
 549
 550         return 0;
 551 }
 552
 553 static __init void hv_init_evmcs(void)
 554 {
 555         int cpu;
 556
 557         if (!enlightened_vmcs)
 558                 return;
 559
 560         /*
 561          * Enlightened VMCS usage should be recommended and the host needs
 562          * to support eVMCS v1 or above.
 563          */
 564         if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
 565             (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
 566              KVM_EVMCS_VERSION) {
 567
 568                 /* Check that we have assist pages on all online CPUs */
 569                 for_each_online_cpu(cpu) {
 570                         if (!hv_get_vp_assist_page(cpu)) {
 571                                 enlightened_vmcs = false;
 572                                 break;
 573                         }
 574                 }
 575
 576                 if (enlightened_vmcs) {
 577                         pr_info("Using Hyper-V Enlightened VMCS\n");
 578                         static_branch_enable(&__kvm_is_using_evmcs);
 579                 }
 580
 581                 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
 582                         vt_x86_ops.enable_l2_tlb_flush
 583                                 = hv_enable_l2_tlb_flush;
 584         } else {
 585                 enlightened_vmcs = false;
 586         }
 587 }
 588
 589 static void hv_reset_evmcs(void)
 590 {
 591         struct hv_vp_assist_page *vp_ap;
 592
 593         if (!kvm_is_using_evmcs())
 594                 return;
 595
 596         /*
 597          * KVM should enable eVMCS if and only if all CPUs have a VP assist
 598          * page, and should reject CPU onlining if eVMCS is enabled the CPU
 599          * doesn't have a VP assist page allocated.
 600          */
 601         vp_ap = hv_get_vp_assist_page(smp_processor_id());
 602         if (WARN_ON_ONCE(!vp_ap))
 603                 return;
 604
 605         /*
 606          * Reset everything to support using non-enlightened VMCS access later
 607          * (e.g. when we reload the module with enlightened_vmcs=0)
 608          */
 609         vp_ap->nested_control.features.directhypercall = 0;
 610         vp_ap->current_nested_vmcs = 0;
 611         vp_ap->enlighten_vmentry = 0;
 612 }
 613
 614 #else /* IS_ENABLED(CONFIG_HYPERV) */
 615 static void hv_init_evmcs(void) {}
 616 static void hv_reset_evmcs(void) {}
 617 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 618
 619 /*
 620  * Comment's format: document - errata name - stepping - processor name.
 621  * Refer from
 622  * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 623  */
 624 static u32 vmx_preemption_cpu_tfms[] = {
 625 /* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
 626 0x000206E6,
 627 /* 323056.pdf - AAX65  - C2 - Xeon L3406 */
 628 /* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
 629 /* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
 630 0x00020652,
 631 /* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
 632 0x00020655,
 633 /* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
 634 /* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
 635 /*
 636  * 320767.pdf - AAP86  - B1 -
 637  * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 638  */
 639 0x000106E5,
 640 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
 641 0x000106A0,
 642 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
 643 0x000106A1,
 644 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
 645 0x000106A4,
 646  /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
 647  /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
 648  /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
 649 0x000106A5,
 650  /* Xeon E3-1220 V2 */
 651 0x000306A8,
 652 };
 653
 654 static inline bool cpu_has_broken_vmx_preemption_timer(void)
 655 {
 656         u32 eax = cpuid_eax(0x00000001), i;
 657
 658         /* Clear the reserved bits */
 659         eax &= ~(0x3U << 14 | 0xfU << 28);
 660         for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
 661                 if (eax == vmx_preemption_cpu_tfms[i])
 662                         return true;
 663
 664         return false;
 665 }
 666
 667 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
 668 {
 669         return flexpriority_enabled && lapic_in_kernel(vcpu);
 670 }
 671
 672 static int vmx_get_passthrough_msr_slot(u32 msr)
 673 {
 674         int i;
 675
 676         switch (msr) {
 677         case 0x800 ... 0x8ff:
 678                 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
 679                 return -ENOENT;
 680         case MSR_IA32_RTIT_STATUS:
 681         case MSR_IA32_RTIT_OUTPUT_BASE:
 682         case MSR_IA32_RTIT_OUTPUT_MASK:
 683         case MSR_IA32_RTIT_CR3_MATCH:
 684         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
 685                 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
 686         case MSR_LBR_SELECT:
 687         case MSR_LBR_TOS:
 688         case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
 689         case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
 690         case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
 691         case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
 692         case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
 693                 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
 694                 return -ENOENT;
 695         }
 696
 697         for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
 698                 if (vmx_possible_passthrough_msrs[i] == msr)
 699                         return i;
 700         }
 701
 702         WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
 703         return -ENOENT;
 704 }
 705
 706 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
 707 {
 708         int i;
 709
 710         i = kvm_find_user_return_msr(msr);
 711         if (i >= 0)
 712                 return &vmx->guest_uret_msrs[i];
 713         return NULL;
 714 }
 715
 716 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
 717                                   struct vmx_uret_msr *msr, u64 data)
 718 {
 719         unsigned int slot = msr - vmx->guest_uret_msrs;
 720         int ret = 0;
 721
 722         if (msr->load_into_hardware) {
 723                 preempt_disable();
 724                 ret = kvm_set_user_return_msr(slot, data, msr->mask);
 725                 preempt_enable();
 726         }
 727         if (!ret)
 728                 msr->data = data;
 729         return ret;
 730 }
 731
 732 /*
 733  * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
 734  *
 735  * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
 736  * atomically track post-VMXON state, e.g. this may be called in NMI context.
 737  * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
 738  * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
 739  * magically in RM, VM86, compat mode, or at CPL>0.
 740  */
 741 static int kvm_cpu_vmxoff(void)
 742 {
 743         asm goto("1: vmxoff\n\t"
 744                           _ASM_EXTABLE(1b, %l[fault])
 745                           ::: "cc", "memory" : fault);
 746
 747         cr4_clear_bits(X86_CR4_VMXE);
 748         return 0;
 749
 750 fault:
 751         cr4_clear_bits(X86_CR4_VMXE);
 752         return -EIO;
 753 }
 754
 755 void vmx_emergency_disable_virtualization_cpu(void)
 756 {
 757         int cpu = raw_smp_processor_id();
 758         struct loaded_vmcs *v;
 759
 760         kvm_rebooting = true;
 761
 762         /*
 763          * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
 764          * set in task context.  If this races with VMX is disabled by an NMI,
 765          * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
 766          * kvm_rebooting set.
 767          */
 768         if (!(__read_cr4() & X86_CR4_VMXE))
 769                 return;
 770
 771         list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
 772                             loaded_vmcss_on_cpu_link)
 773                 vmcs_clear(v->vmcs);
 774
 775         kvm_cpu_vmxoff();
 776 }
 777
 778 static void __loaded_vmcs_clear(void *arg)
 779 {
 780         struct loaded_vmcs *loaded_vmcs = arg;
 781         int cpu = raw_smp_processor_id();
 782
 783         if (loaded_vmcs->cpu != cpu)
 784                 return; /* vcpu migration can race with cpu offline */
 785         if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
 786                 per_cpu(current_vmcs, cpu) = NULL;
 787
 788         vmcs_clear(loaded_vmcs->vmcs);
 789         if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
 790                 vmcs_clear(loaded_vmcs->shadow_vmcs);
 791
 792         list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
 793
 794         /*
 795          * Ensure all writes to loaded_vmcs, including deleting it from its
 796          * current percpu list, complete before setting loaded_vmcs->cpu to
 797          * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
 798          * and add loaded_vmcs to its percpu list before it's deleted from this
 799          * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
 800          */
 801         smp_wmb();
 802
 803         loaded_vmcs->cpu = -1;
 804         loaded_vmcs->launched = 0;
 805 }
 806
 807 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 808 {
 809         int cpu = loaded_vmcs->cpu;
 810
 811         if (cpu != -1)
 812                 smp_call_function_single(cpu,
 813                          __loaded_vmcs_clear, loaded_vmcs, 1);
 814 }
 815
 816 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
 817                                        unsigned field)
 818 {
 819         bool ret;
 820         u32 mask = 1 << (seg * SEG_FIELD_NR + field);
 821
 822         if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
 823                 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
 824                 vmx->segment_cache.bitmask = 0;
 825         }
 826         ret = vmx->segment_cache.bitmask & mask;
 827         vmx->segment_cache.bitmask |= mask;
 828         return ret;
 829 }
 830
 831 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
 832 {
 833         u16 *p = &vmx->segment_cache.seg[seg].selector;
 834
 835         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
 836                 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
 837         return *p;
 838 }
 839
 840 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
 841 {
 842         ulong *p = &vmx->segment_cache.seg[seg].base;
 843
 844         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
 845                 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
 846         return *p;
 847 }
 848
 849 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
 850 {
 851         u32 *p = &vmx->segment_cache.seg[seg].limit;
 852
 853         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
 854                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
 855         return *p;
 856 }
 857
 858 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
 859 {
 860         u32 *p = &vmx->segment_cache.seg[seg].ar;
 861
 862         if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
 863                 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
 864         return *p;
 865 }
 866
 867 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
 868 {
 869         u32 eb;
 870
 871         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 872              (1u << DB_VECTOR) | (1u << AC_VECTOR);
 873         /*
 874          * #VE isn't used for VMX.  To test against unexpected changes
 875          * related to #VE for VMX, intercept unexpected #VE and warn on it.
 876          */
 877         if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
 878                 eb |= 1u << VE_VECTOR;
 879         /*
 880          * Guest access to VMware backdoor ports could legitimately
 881          * trigger #GP because of TSS I/O permission bitmap.
 882          * We intercept those #GP and allow access to them anyway
 883          * as VMware does.
 884          */
 885         if (enable_vmware_backdoor)
 886                 eb |= (1u << GP_VECTOR);
 887         if ((vcpu->guest_debug &
 888              (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
 889             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
 890                 eb |= 1u << BP_VECTOR;
 891         if (to_vmx(vcpu)->rmode.vm86_active)
 892                 eb = ~0;
 893         if (!vmx_need_pf_intercept(vcpu))
 894                 eb &= ~(1u << PF_VECTOR);
 895
 896         /* When we are running a nested L2 guest and L1 specified for it a
 897          * certain exception bitmap, we must trap the same exceptions and pass
 898          * them to L1. When running L2, we will only handle the exceptions
 899          * specified above if L1 did not want them.
 900          */
 901         if (is_guest_mode(vcpu))
 902                 eb |= get_vmcs12(vcpu)->exception_bitmap;
 903         else {
 904                 int mask = 0, match = 0;
 905
 906                 if (enable_ept && (eb & (1u << PF_VECTOR))) {
 907                         /*
 908                          * If EPT is enabled, #PF is currently only intercepted
 909                          * if MAXPHYADDR is smaller on the guest than on the
 910                          * host.  In that case we only care about present,
 911                          * non-reserved faults.  For vmcs02, however, PFEC_MASK
 912                          * and PFEC_MATCH are set in prepare_vmcs02_rare.
 913                          */
 914                         mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
 915                         match = PFERR_PRESENT_MASK;
 916                 }
 917                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
 918                 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
 919         }
 920
 921         /*
 922          * Disabling xfd interception indicates that dynamic xfeatures
 923          * might be used in the guest. Always trap #NM in this case
 924          * to save guest xfd_err timely.
 925          */
 926         if (vcpu->arch.xfd_no_write_intercept)
 927                 eb |= (1u << NM_VECTOR);
 928
 929         vmcs_write32(EXCEPTION_BITMAP, eb);
 930 }
 931
 932 /*
 933  * Check if MSR is intercepted for currently loaded MSR bitmap.
 934  */
 935 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
 936 {
 937         if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
 938                 return true;
 939
 940         return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
 941 }
 942
 943 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
 944 {
 945         unsigned int flags = 0;
 946
 947         if (vmx->loaded_vmcs->launched)
 948                 flags |= VMX_RUN_VMRESUME;
 949
 950         /*
 951          * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
 952          * to change it directly without causing a vmexit.  In that case read
 953          * it after vmexit and store it in vmx->spec_ctrl.
 954          */
 955         if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
 956                 flags |= VMX_RUN_SAVE_SPEC_CTRL;
 957
 958         return flags;
 959 }
 960
/*
 * Turn off a dedicated VM-entry/VM-exit control pair for @vmx: clear @entry
 * in the VM-entry controls and @exit in the VM-exit controls.
 */
static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
                unsigned long entry, unsigned long exit)
{
        vm_entry_controls_clearbit(vmx, entry);
        vm_exit_controls_clearbit(vmx, exit);
}
 967
 968 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
 969 {
 970         unsigned int i;
 971
 972         for (i = 0; i < m->nr; ++i) {
 973                 if (m->val[i].index == msr)
 974                         return i;
 975         }
 976         return -ENOENT;
 977 }
 978
 979 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 980 {
 981         int i;
 982         struct msr_autoload *m = &vmx->msr_autoload;
 983
 984         switch (msr) {
 985         case MSR_EFER:
 986                 if (cpu_has_load_ia32_efer()) {
 987                         clear_atomic_switch_msr_special(vmx,
 988                                         VM_ENTRY_LOAD_IA32_EFER,
 989                                         VM_EXIT_LOAD_IA32_EFER);
 990                         return;
 991                 }
 992                 break;
 993         case MSR_CORE_PERF_GLOBAL_CTRL:
 994                 if (cpu_has_load_perf_global_ctrl()) {
 995                         clear_atomic_switch_msr_special(vmx,
 996                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 997                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
 998                         return;
 999                 }
1000                 break;
1001         }
1002         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
1003         if (i < 0)
1004                 goto skip_guest;
1005         --m->guest.nr;
1006         m->guest.val[i] = m->guest.val[m->guest.nr];
1007         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1008
1009 skip_guest:
1010         i = vmx_find_loadstore_msr_slot(&m->host, msr);
1011         if (i < 0)
1012                 return;
1013
1014         --m->host.nr;
1015         m->host.val[i] = m->host.val[m->host.nr];
1016         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1017 }
1018
1019 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1020                 unsigned long entry, unsigned long exit,
1021                 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1022                 u64 guest_val, u64 host_val)
1023 {
1024         vmcs_write64(guest_val_vmcs, guest_val);
1025         if (host_val_vmcs != HOST_IA32_EFER)
1026                 vmcs_write64(host_val_vmcs, host_val);
1027         vm_entry_controls_setbit(vmx, entry);
1028         vm_exit_controls_setbit(vmx, exit);
1029 }
1030
1031 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1032                                   u64 guest_val, u64 host_val, bool entry_only)
1033 {
1034         int i, j = 0;
1035         struct msr_autoload *m = &vmx->msr_autoload;
1036
1037         switch (msr) {
1038         case MSR_EFER:
1039                 if (cpu_has_load_ia32_efer()) {
1040                         add_atomic_switch_msr_special(vmx,
1041                                         VM_ENTRY_LOAD_IA32_EFER,
1042                                         VM_EXIT_LOAD_IA32_EFER,
1043                                         GUEST_IA32_EFER,
1044                                         HOST_IA32_EFER,
1045                                         guest_val, host_val);
1046                         return;
1047                 }
1048                 break;
1049         case MSR_CORE_PERF_GLOBAL_CTRL:
1050                 if (cpu_has_load_perf_global_ctrl()) {
1051                         add_atomic_switch_msr_special(vmx,
1052                                         VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1053                                         VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1054                                         GUEST_IA32_PERF_GLOBAL_CTRL,
1055                                         HOST_IA32_PERF_GLOBAL_CTRL,
1056                                         guest_val, host_val);
1057                         return;
1058                 }
1059                 break;
1060         case MSR_IA32_PEBS_ENABLE:
1061                 /* PEBS needs a quiescent period after being disabled (to write
1062                  * a record).  Disabling PEBS through VMX MSR swapping doesn't
1063                  * provide that period, so a CPU could write host's record into
1064                  * guest's memory.
1065                  */
1066                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1067         }
1068
1069         i = vmx_find_loadstore_msr_slot(&m->guest, msr);
1070         if (!entry_only)
1071                 j = vmx_find_loadstore_msr_slot(&m->host, msr);
1072
1073         if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
1074             (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
1075                 printk_once(KERN_WARNING "Not enough msr switch entries. "
1076                                 "Can't add msr %x\n", msr);
1077                 return;
1078         }
1079         if (i < 0) {
1080                 i = m->guest.nr++;
1081                 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1082         }
1083         m->guest.val[i].index = msr;
1084         m->guest.val[i].value = guest_val;
1085
1086         if (entry_only)
1087                 return;
1088
1089         if (j < 0) {
1090                 j = m->host.nr++;
1091                 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1092         }
1093         m->host.val[j].index = msr;
1094         m->host.val[j].value = host_val;
1095 }
1096
/*
 * Decide how the guest's EFER is switched across VM-entry/VM-exit.
 *
 * Returns false when EFER is handled through the atomic switch mechanism
 * (either added via add_atomic_switch_msr() or removed via
 * clear_atomic_switch_msr()), or when MSR_EFER has no user-return MSR slot.
 * Returns true after filling the MSR_EFER user-return slot with the guest
 * value and the mask of bits that are not ignored.
 */
static bool update_transition_efer(struct vcpu_vmx *vmx)
{
        u64 guest_efer = vmx->vcpu.arch.efer;
        u64 ignore_bits = 0;
        int i;

        /* Shadow paging assumes NX to be available.  */
        if (!enable_ept)
                guest_efer |= EFER_NX;

        /*
         * LMA and LME handled by hardware; SCE meaningless outside long mode.
         */
        ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
        ignore_bits |= EFER_LMA | EFER_LME;
        /* SCE is meaningful only in long mode on Intel */
        if (guest_efer & EFER_LMA)
                ignore_bits &= ~(u64)EFER_SCE;
#endif

        /*
         * On EPT, we can't emulate NX, so we must switch EFER atomically.
         * On CPUs that support "load IA32_EFER", always switch EFER
         * atomically, since it's faster than switching it manually.
         */
        if (cpu_has_load_ia32_efer() ||
            (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
                /* LME is dropped from the hardware value while LMA is clear. */
                if (!(guest_efer & EFER_LMA))
                        guest_efer &= ~EFER_LME;
                /* An atomic switch is only needed when guest and host differ. */
                if (guest_efer != kvm_host.efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
                                              guest_efer, kvm_host.efer, false);
                else
                        clear_atomic_switch_msr(vmx, MSR_EFER);
                return false;
        }

        i = kvm_find_user_return_msr(MSR_EFER);
        if (i < 0)
                return false;

        /* EFER goes through the user-return slot; drop any atomic entry. */
        clear_atomic_switch_msr(vmx, MSR_EFER);

        /* Ignored bits take the host's value, the rest keep the guest's. */
        guest_efer &= ~ignore_bits;
        guest_efer |= kvm_host.efer & ignore_bits;

        vmx->guest_uret_msrs[i].data = guest_efer;
        vmx->guest_uret_msrs[i].mask = ~ignore_bits;

        return true;
}
1149
#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table.  KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 *
 * Returns the base address found in the descriptor that @selector refers
 * to, or 0 when @selector (or, for an LDT-relative selector, the LDT
 * selector itself) has no bits set outside the RPL field.
 */
static unsigned long segment_base(u16 selector)
{
        struct desc_struct *table;
        unsigned long v;

        /* Nothing set outside the RPL bits: treat as having no base. */
        if (!(selector & ~SEGMENT_RPL_MASK))
                return 0;

        table = get_current_gdt_ro();

        /* LDT-relative selector: look up the LDT's base and index into it. */
        if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
                u16 ldt_selector = kvm_read_ldt();

                if (!(ldt_selector & ~SEGMENT_RPL_MASK))
                        return 0;

                table = (struct desc_struct *)segment_base(ldt_selector);
        }
        /* selector >> 3 is used as the descriptor index within the table. */
        v = get_desc_base(&table[selector >> 3]);
        return v;
}
#endif
1178
1179 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1180 {
1181         return vmx_pt_mode_is_host_guest() &&
1182                !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1183 }
1184
/*
 * Check whether @base is acceptable as a PT output base for @vcpu; the
 * decision is delegated to kvm_vcpu_is_legal_aligned_gpa() with an alignment
 * of 128.
 */
static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
        /* The base must be 128-byte aligned and a legal physical address. */
        return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}
1190
1191 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1192 {
1193         u32 i;
1194
1195         wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1196         wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1197         wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1198         wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1199         for (i = 0; i < addr_range; i++) {
1200                 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1201                 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1202         }
1203 }
1204
1205 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1206 {
1207         u32 i;
1208
1209         rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1210         rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1211         rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1212         rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1213         for (i = 0; i < addr_range; i++) {
1214                 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1215                 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1216         }
1217 }
1218
1219 static void pt_guest_enter(struct vcpu_vmx *vmx)
1220 {
1221         if (vmx_pt_mode_is_system())
1222                 return;
1223
1224         /*
1225          * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1226          * Save host state before VM entry.
1227          */
1228         rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1229         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1230                 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1231                 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1232                 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1233         }
1234 }
1235
1236 static void pt_guest_exit(struct vcpu_vmx *vmx)
1237 {
1238         if (vmx_pt_mode_is_system())
1239                 return;
1240
1241         if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1242                 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
1243                 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
1244         }
1245
1246         /*
1247          * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
1248          * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
1249          */
1250         if (vmx->pt_desc.host.ctl)
1251                 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1252 }
1253
1254 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1255                         unsigned long fs_base, unsigned long gs_base)
1256 {
1257         if (unlikely(fs_sel != host->fs_sel)) {
1258                 if (!(fs_sel & 7))
1259                         vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1260                 else
1261                         vmcs_write16(HOST_FS_SELECTOR, 0);
1262                 host->fs_sel = fs_sel;
1263         }
1264         if (unlikely(gs_sel != host->gs_sel)) {
1265                 if (!(gs_sel & 7))
1266                         vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1267                 else
1268                         vmcs_write16(HOST_GS_SELECTOR, 0);
1269                 host->gs_sel = gs_sel;
1270         }
1271         if (unlikely(fs_base != host->fs_base)) {
1272                 vmcs_writel(HOST_FS_BASE, fs_base);
1273                 host->fs_base = fs_base;
1274         }
1275         if (unlikely(gs_base != host->gs_base)) {
1276                 vmcs_writel(HOST_GS_BASE, gs_base);
1277                 host->gs_base = gs_base;
1278         }
1279 }
1280
/*
 * Load the guest-side state that is switched lazily rather than on every
 * VM-entry: the guest's user-return MSRs, the vmcs12-to-shadow sync when one
 * is pending, and (once per guest_state_loaded period) the host segment
 * state recorded in the VMCS plus, on 64-bit, the guest's KERNEL_GS_BASE.
 */
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
        int cpu = raw_smp_processor_id();
#endif
        unsigned long fs_base, gs_base;
        u16 fs_sel, gs_sel;
        int i;

        /*
         * Note that guest MSRs to be saved/restored can also be changed
         * when guest state is loaded. This happens when guest transitions
         * to/from long-mode by setting MSR_EFER.LMA.
         */
        if (!vmx->guest_uret_msrs_loaded) {
                vmx->guest_uret_msrs_loaded = true;
                for (i = 0; i < kvm_nr_uret_msrs; ++i) {
                        /* Skip slots not flagged for loading into hardware. */
                        if (!vmx->guest_uret_msrs[i].load_into_hardware)
                                continue;

                        kvm_set_user_return_msr(i,
                                                vmx->guest_uret_msrs[i].data,
                                                vmx->guest_uret_msrs[i].mask);
                }
        }

        if (vmx->nested.need_vmcs12_to_shadow_sync)
                nested_sync_vmcs12_to_shadow(vcpu);

        /* Everything below is done once per guest_state_loaded period. */
        if (vmx->guest_state_loaded)
                return;

        host_state = &vmx->loaded_vmcs->host_state;

        /*
         * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
         * allow segment selectors with cpl > 0 or ti == 1.
         */
        host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
        savesegment(ds, host_state->ds_sel);
        savesegment(es, host_state->es_sel);

        gs_base = cpu_kernelmode_gs_base(cpu);
        if (likely(is_64bit_mm(current->mm))) {
                /* Take the FS/GS state from current->thread after saving it. */
                current_save_fsgs();
                fs_sel = current->thread.fsindex;
                gs_sel = current->thread.gsindex;
                fs_base = current->thread.fsbase;
                vmx->msr_host_kernel_gs_base = current->thread.gsbase;
        } else {
                /* Otherwise read the selectors and MSRs directly. */
                savesegment(fs, fs_sel);
                savesegment(gs, gs_sel);
                fs_base = read_msr(MSR_FS_BASE);
                vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
        }

        /* The host value is saved above; install the guest's KERNEL_GS_BASE. */
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
        savesegment(fs, fs_sel);
        savesegment(gs, gs_sel);
        fs_base = segment_base(fs_sel);
        gs_base = segment_base(gs_sel);
#endif

        vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
        vmx->guest_state_loaded = true;
}
1352
/*
 * Undo vmx_prepare_switch_to_guest(): restore the host segment state that
 * was recorded in loaded_vmcs->host_state and, on 64-bit, swap the guest's
 * KERNEL_GS_BASE out for the host's.  Does nothing when guest state is not
 * currently loaded.
 */
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
        struct vmcs_host_state *host_state;

        if (!vmx->guest_state_loaded)
                return;

        host_state = &vmx->loaded_vmcs->host_state;

        ++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
        /* Capture the guest's current KERNEL_GS_BASE before it is replaced. */
        rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
        /* Reload LDT and GS when an LDT was in use or GS has low bits set. */
        if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
                kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
                load_gs_index(host_state->gs_sel);
#else
                loadsegment(gs, host_state->gs_sel);
#endif
        }
        if (host_state->fs_sel & 7)
                loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
        /* DS/ES only need reloading when either saved selector is non-zero. */
        if (unlikely(host_state->ds_sel | host_state->es_sel)) {
                loadsegment(ds, host_state->ds_sel);
                loadsegment(es, host_state->es_sel);
        }
#endif
        invalidate_tss_limit();
#ifdef CONFIG_X86_64
        wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
        load_fixmap_gdt(raw_smp_processor_id());
        /* Force both lazy loads to be redone before the next VM-entry. */
        vmx->guest_state_loaded = false;
        vmx->guest_uret_msrs_loaded = false;
}
1391
1392 #ifdef CONFIG_X86_64
/*
 * Return the guest's KERNEL_GS_BASE.  While guest state is loaded, the
 * cached copy in @vmx is refreshed from the hardware MSR first; the check
 * and the MSR read are done with preemption disabled.
 */
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
        preempt_disable();
        if (vmx->guest_state_loaded)
                rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
        preempt_enable();
        return vmx->msr_guest_kernel_gs_base;
}
1401
/*
 * Set the guest's KERNEL_GS_BASE to @data.  While guest state is loaded the
 * hardware MSR is written as well; the check and the MSR write are done with
 * preemption disabled.  The cached copy in @vmx is always updated.
 */
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
        preempt_disable();
        if (vmx->guest_state_loaded)
                wrmsrl(MSR_KERNEL_GS_BASE, data);
        preempt_enable();
        vmx->msr_guest_kernel_gs_base = data;
}
1410 #endif
1411
1412 static void grow_ple_window(struct kvm_vcpu *vcpu)
1413 {
1414         struct vcpu_vmx *vmx = to_vmx(vcpu);
1415         unsigned int old = vmx->ple_window;
1416
1417         vmx->ple_window = __grow_ple_window(old, ple_window,
1418                                             ple_window_grow,
1419                                             ple_window_max);
1420
1421         if (vmx->ple_window != old) {
1422                 vmx->ple_window_dirty = true;
1423                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1424                                             vmx->ple_window, old);
1425         }
1426 }
1427
1428 static void shrink_ple_window(struct kvm_vcpu *vcpu)
1429 {
1430         struct vcpu_vmx *vmx = to_vmx(vcpu);
1431         unsigned int old = vmx->ple_window;
1432
1433         vmx->ple_window = __shrink_ple_window(old, ple_window,
1434                                               ple_window_shrink,
1435                                               ple_window);
1436
1437         if (vmx->ple_window != old) {
1438                 vmx->ple_window_dirty = true;
1439                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1440                                             vmx->ple_window, old);
1441         }
1442 }
1443
/*
 * Make vmx->loaded_vmcs the current VMCS on @cpu.
 *
 * If the VMCS was last loaded on a different CPU, it is cleared there, added
 * to @cpu's list of loaded VMCSs, and its per-CPU host fields are rewritten.
 *
 * @buddy: when non-NULL, expected to be the loaded_vmcs whose VMCS was
 *         current on @cpu before this call.  In that case the indirect
 *         branch prediction barrier is skipped; a mismatch warns once and
 *         issues the barrier anyway.
 */
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
                        struct loaded_vmcs *buddy)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
        struct vmcs *prev;

        if (!already_loaded) {
                loaded_vmcs_clear(vmx->loaded_vmcs);
                local_irq_disable();

                /*
                 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
                 * this cpu's percpu list, otherwise it may not yet be deleted
                 * from its previous cpu's percpu list.  Pairs with the
                 * smp_wmb() in __loaded_vmcs_clear().
                 */
                smp_rmb();

                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
                local_irq_enable();
        }

        /* Only issue VMPTRLD when a different VMCS is current on this CPU. */
        prev = per_cpu(current_vmcs, cpu);
        if (prev != vmx->loaded_vmcs->vmcs) {
                per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
                vmcs_load(vmx->loaded_vmcs->vmcs);

                /*
                 * No indirect branch prediction barrier needed when switching
                 * the active VMCS within a vCPU, unless IBRS is advertised to
                 * the vCPU.  To minimize the number of IBPBs executed, KVM
                 * performs IBPB on nested VM-Exit (a single nested transition
                 * may switch the active VMCS multiple times).
                 */
                if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
                        indirect_branch_prediction_barrier();
        }

        if (!already_loaded) {
                void *gdt = get_current_gdt_ro();

                /*
                 * Flush all EPTP/VPID contexts, the new pCPU may have stale
                 * TLB entries from its previous association with the vCPU.
                 */
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

                /*
                 * Linux uses per-cpu TSS and GDT, so set these when switching
                 * processors.  See 22.2.4.
                 */
                vmcs_writel(HOST_TR_BASE,
                            (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
                vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

                if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
                        /* 22.2.3 */
                        vmcs_writel(HOST_IA32_SYSENTER_ESP,
                                    (unsigned long)(cpu_entry_stack(cpu) + 1));
                }

                /* Record the new owner only after the VMCS is fully set up. */
                vmx->loaded_vmcs->cpu = cpu;
        }
}
1510
/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 *
 * Loads the vCPU's VMCS on @cpu, runs the posted-interrupt load hook and
 * snapshots the host's DEBUGCTL MSR value into vmx->host_debugctlmsr.
 */
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /* Shrink the PLE window when coming back from being scheduled out. */
        if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
                shrink_ple_window(vcpu);

        vmx_vcpu_load_vmcs(vcpu, cpu, NULL);

        vmx_vcpu_pi_load(vcpu, cpu);

        vmx->host_debugctlmsr = get_debugctlmsr();
}
1528
/*
 * Counterpart of vmx_vcpu_load(): run the posted-interrupt put hook, then
 * restore the lazily switched host state.
 */
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
        vmx_vcpu_pi_put(vcpu);

        vmx_prepare_switch_to_host(to_vmx(vcpu));
}
1535
/*
 * Return true when emulate_invalid_guest_state is set and
 * vmx_guest_state_valid() reports @vcpu's guest state as not valid.
 */
bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
        return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}
1540
1541 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1542 {
1543         struct vcpu_vmx *vmx = to_vmx(vcpu);
1544         unsigned long rflags, save_rflags;
1545
1546         if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1547                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1548                 rflags = vmcs_readl(GUEST_RFLAGS);
1549                 if (vmx->rmode.vm86_active) {
1550                         rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1551                         save_rflags = vmx->rmode.save_rflags;
1552                         rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1553                 }
1554                 vmx->rflags = rflags;
1555         }
1556         return vmx->rflags;
1557 }
1558
1559 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1560 {
1561         struct vcpu_vmx *vmx = to_vmx(vcpu);
1562         unsigned long old_rflags;
1563
1564         /*
1565          * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
1566          * is an unrestricted guest in order to mark L2 as needing emulation
1567          * if L1 runs L2 as a restricted guest.
1568          */
1569         if (is_unrestricted_guest(vcpu)) {
1570                 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
1571                 vmx->rflags = rflags;
1572                 vmcs_writel(GUEST_RFLAGS, rflags);
1573                 return;
1574         }
1575
1576         old_rflags = vmx_get_rflags(vcpu);
1577         vmx->rflags = rflags;
1578         if (vmx->rmode.vm86_active) {
1579                 vmx->rmode.save_rflags = rflags;
1580                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1581         }
1582         vmcs_writel(GUEST_RFLAGS, rflags);
1583
1584         if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1585                 vmx->emulation_required = vmx_emulation_required(vcpu);
1586 }
1587
/* Return true when the IF bit is set in the guest's RFLAGS. */
bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
        return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}
1592
1593 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1594 {
1595         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1596         int ret = 0;
1597
1598         if (interruptibility & GUEST_INTR_STATE_STI)
1599                 ret |= KVM_X86_SHADOW_INT_STI;
1600         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1601                 ret |= KVM_X86_SHADOW_INT_MOV_SS;
1602
1603         return ret;
1604 }
1605
1606 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
1607 {
1608         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1609         u32 interruptibility = interruptibility_old;
1610
1611         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1612
1613         if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1614                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
1615         else if (mask & KVM_X86_SHADOW_INT_STI)
1616                 interruptibility |= GUEST_INTR_STATE_STI;
1617
1618         if ((interruptibility != interruptibility_old))
1619                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1620 }
1621
1622 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1623 {
1624         struct vcpu_vmx *vmx = to_vmx(vcpu);
1625         unsigned long value;
1626
1627         /*
         * Any MSR write that attempts to change bits marked reserved will
         * cause a #GP fault.
1630          */
1631         if (data & vmx->pt_desc.ctl_bitmask)
1632                 return 1;
1633
1634         /*
1635          * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1636          * result in a #GP unless the same write also clears TraceEn.
1637          */
1638         if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1639                 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1640                 return 1;
1641
1642         /*
1643          * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
1644          * and FabricEn would cause #GP, if
1645          * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
1646          */
1647         if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1648                 !(data & RTIT_CTL_FABRIC_EN) &&
1649                 !intel_pt_validate_cap(vmx->pt_desc.caps,
1650                                         PT_CAP_single_range_output))
1651                 return 1;
1652
1653         /*
1654          * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
1655          * utilize encodings marked reserved will cause a #GP fault.
1656          */
1657         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1658         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1659                         !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1660                         RTIT_CTL_MTC_RANGE_OFFSET, &value))
1661                 return 1;
1662         value = intel_pt_validate_cap(vmx->pt_desc.caps,
1663                                                 PT_CAP_cycle_thresholds);
1664         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1665                         !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1666                         RTIT_CTL_CYC_THRESH_OFFSET, &value))
1667                 return 1;
1668         value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1669         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1670                         !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1671                         RTIT_CTL_PSB_FREQ_OFFSET, &value))
1672                 return 1;
1673
1674         /*
1675          * If ADDRx_CFG is reserved or the encodings is >2 will
1676          * cause a #GP fault.
1677          */
1678         value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1679         if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
1680                 return 1;
1681         value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1682         if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
1683                 return 1;
1684         value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1685         if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
1686                 return 1;
1687         value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1688         if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
1689                 return 1;
1690
1691         return 0;
1692 }
1693
1694 int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
1695                                   void *insn, int insn_len)
1696 {
1697         /*
1698          * Emulation of instructions in SGX enclaves is impossible as RIP does
1699          * not point at the failing instruction, and even if it did, the code
1700          * stream is inaccessible.  Inject #UD instead of exiting to userspace
1701          * so that guest userspace can't DoS the guest simply by triggering
1702          * emulation (enclaves are CPL3 only).
1703          */
1704         if (to_vmx(vcpu)->exit_reason.enclave_mode) {
1705                 kvm_queue_exception(vcpu, UD_VECTOR);
1706                 return X86EMUL_PROPAGATE_FAULT;
1707         }
1708         return X86EMUL_CONTINUE;
1709 }
1710
1711 static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1712 {
1713         union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
1714         unsigned long rip, orig_rip;
1715         u32 instr_len;
1716
1717         /*
1718          * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1719          * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1720          * set when EPT misconfig occurs.  In practice, real hardware updates
1721          * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1722          * (namely Hyper-V) don't set it due to it being undefined behavior,
1723          * i.e. we end up advancing IP with some random value.
1724          */
1725         if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1726             exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1727                 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1728
1729                 /*
1730                  * Emulating an enclave's instructions isn't supported as KVM
1731                  * cannot access the enclave's memory or its true RIP, e.g. the
1732                  * vmcs.GUEST_RIP points at the exit point of the enclave, not
1733                  * the RIP that actually triggered the VM-Exit.  But, because
1734                  * most instructions that cause VM-Exit will #UD in an enclave,
1735                  * most instruction-based VM-Exits simply do not occur.
1736                  *
1737                  * There are a few exceptions, notably the debug instructions
1738                  * INT1ICEBRK and INT3, as they are allowed in debug enclaves
1739                  * and generate #DB/#BP as expected, which KVM might intercept.
1740                  * But again, the CPU does the dirty work and saves an instr
1741                  * length of zero so VMMs don't shoot themselves in the foot.
1742                  * WARN if KVM tries to skip a non-zero length instruction on
1743                  * a VM-Exit from an enclave.
1744                  */
1745                 if (!instr_len)
1746                         goto rip_updated;
1747
1748                 WARN_ONCE(exit_reason.enclave_mode,
1749                           "skipping instruction after SGX enclave VM-Exit");
1750
1751                 orig_rip = kvm_rip_read(vcpu);
1752                 rip = orig_rip + instr_len;
1753 #ifdef CONFIG_X86_64
1754                 /*
1755                  * We need to mask out the high 32 bits of RIP if not in 64-bit
1756                  * mode, but just finding out that we are in 64-bit mode is
1757                  * quite expensive.  Only do it if there was a carry.
1758                  */
1759                 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
1760                         rip = (u32)rip;
1761 #endif
1762                 kvm_rip_write(vcpu, rip);
1763         } else {
1764                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1765                         return 0;
1766         }
1767
1768 rip_updated:
1769         /* skipping an emulated instruction also counts */
1770         vmx_set_interrupt_shadow(vcpu, 0);
1771
1772         return 1;
1773 }
1774
1775 /*
1776  * Recognizes a pending MTF VM-exit and records the nested state for later
1777  * delivery.
1778  */
1779 void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1780 {
1781         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1782         struct vcpu_vmx *vmx = to_vmx(vcpu);
1783
1784         if (!is_guest_mode(vcpu))
1785                 return;
1786
1787         /*
1788          * Per the SDM, MTF takes priority over debug-trap exceptions besides
1789          * TSS T-bit traps and ICEBP (INT1).  KVM doesn't emulate T-bit traps
1790          * or ICEBP (in the emulator proper), and skipping of ICEBP after an
1791          * intercepted #DB deliberately avoids single-step #DB and MTF updates
1792          * as ICEBP is higher priority than both.  As instruction emulation is
1793          * completed at this point (i.e. KVM is at the instruction boundary),
1794          * any #DB exception pending delivery must be a debug-trap of lower
1795          * priority than MTF.  Record the pending MTF state to be delivered in
1796          * vmx_check_nested_events().
1797          */
1798         if (nested_cpu_has_mtf(vmcs12) &&
1799             (!vcpu->arch.exception.pending ||
1800              vcpu->arch.exception.vector == DB_VECTOR) &&
1801             (!vcpu->arch.exception_vmexit.pending ||
1802              vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
1803                 vmx->nested.mtf_pending = true;
1804                 kvm_make_request(KVM_REQ_EVENT, vcpu);
1805         } else {
1806                 vmx->nested.mtf_pending = false;
1807         }
1808 }
1809
/*
 * Skip the instruction that caused the current VM-Exit: first refresh the
 * pending-MTF state via vmx_update_emulated_instruction(), then advance RIP.
 * Returns the result of skip_emulated_instruction().
 */
int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
        int skipped;

        vmx_update_emulated_instruction(vcpu);
        skipped = skip_emulated_instruction(vcpu);

        return skipped;
}
1815
1816 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1817 {
1818         /*
1819          * Ensure that we clear the HLT state in the VMCS.  We don't need to
1820          * explicitly skip the instruction because if the HLT state is set,
1821          * then the instruction is already executing and RIP has already been
1822          * advanced.
1823          */
1824         if (kvm_hlt_in_guest(vcpu->kvm) &&
1825                         vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1826                 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1827 }
1828
1829 void vmx_inject_exception(struct kvm_vcpu *vcpu)
1830 {
1831         struct kvm_queued_exception *ex = &vcpu->arch.exception;
1832         u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
1833         struct vcpu_vmx *vmx = to_vmx(vcpu);
1834
1835         kvm_deliver_exception_payload(vcpu, ex);
1836
1837         if (ex->has_error_code) {
1838                 /*
1839                  * Despite the error code being architecturally defined as 32
1840                  * bits, and the VMCS field being 32 bits, Intel CPUs and thus
1841                  * VMX don't actually supporting setting bits 31:16.  Hardware
1842                  * will (should) never provide a bogus error code, but AMD CPUs
1843                  * do generate error codes with bits 31:16 set, and so KVM's
1844                  * ABI lets userspace shove in arbitrary 32-bit values.  Drop
1845                  * the upper bits to avoid VM-Fail, losing information that
1846                  * doesn't really exist is preferable to killing the VM.
1847                  */
1848                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
1849                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1850         }
1851
1852         if (vmx->rmode.vm86_active) {
1853                 int inc_eip = 0;
1854                 if (kvm_exception_is_soft(ex->vector))
1855                         inc_eip = vcpu->arch.event_exit_inst_len;
1856                 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
1857                 return;
1858         }
1859
1860         WARN_ON_ONCE(vmx->emulation_required);
1861
1862         if (kvm_exception_is_soft(ex->vector)) {
1863                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1864                              vmx->vcpu.arch.event_exit_inst_len);
1865                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1866         } else
1867                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1868
1869         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1870
1871         vmx_clear_hlt(vcpu);
1872 }
1873
1874 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
1875                                bool load_into_hardware)
1876 {
1877         struct vmx_uret_msr *uret_msr;
1878
1879         uret_msr = vmx_find_uret_msr(vmx, msr);
1880         if (!uret_msr)
1881                 return;
1882
1883         uret_msr->load_into_hardware = load_into_hardware;
1884 }
1885
1886 /*
1887  * Configuring user return MSRs to automatically save, load, and restore MSRs
1888  * that need to be shoved into hardware when running the guest.  Note, omitting
1889  * an MSR here does _NOT_ mean it's not emulated, only that it will not be
1890  * loaded into hardware when running the guest.
1891  */
1892 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
1893 {
1894 #ifdef CONFIG_X86_64
1895         bool load_syscall_msrs;
1896
1897         /*
1898          * The SYSCALL MSRs are only needed on long mode guests, and only
1899          * when EFER.SCE is set.
1900          */
1901         load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
1902                             (vmx->vcpu.arch.efer & EFER_SCE);
1903
1904         vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
1905         vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
1906         vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
1907 #endif
1908         vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
1909
1910         vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1911                            guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
1912                            guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
1913
1914         /*
1915          * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1916          * kernel and old userspace.  If those guests run on a tsx=off host, do
1917          * allow guests to use TSX_CTRL, but don't change the value in hardware
1918          * so that TSX remains always disabled.
1919          */
1920         vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
1921
1922         /*
1923          * The set of MSRs to load may have changed, reload MSRs before the
1924          * next VM-Enter.
1925          */
1926         vmx->guest_uret_msrs_loaded = false;
1927 }
1928
1929 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1930 {
1931         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1932
1933         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
1934                 return vmcs12->tsc_offset;
1935
1936         return 0;
1937 }
1938
1939 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1940 {
1941         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1942
1943         if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
1944             nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
1945                 return vmcs12->tsc_multiplier;
1946
1947         return kvm_caps.default_tsc_scaling_ratio;
1948 }
1949
1950 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
1951 {
1952         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
1953 }
1954
1955 void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
1956 {
1957         vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
1958 }
1959
1960 /*
1961  * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
1962  * guest CPUID.  Note, KVM allows userspace to set "VMX in SMX" to maintain
1963  * backwards compatibility even though KVM doesn't support emulating SMX.  And
1964  * because userspace set "VMX in SMX", the guest must also be allowed to set it,
1965  * e.g. if the MSR is left unlocked and the guest does a RMW operation.
1966  */
1967 #define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED                  | \
1968                                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX  | \
1969                                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
1970                                         FEAT_CTL_SGX_LC_ENABLED          | \
1971                                         FEAT_CTL_SGX_ENABLED             | \
1972                                         FEAT_CTL_LMCE_ENABLED)
1973
1974 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
1975                                                     struct msr_data *msr)
1976 {
1977         uint64_t valid_bits;
1978
1979         /*
1980          * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
1981          * exposed to the guest.
1982          */
1983         WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
1984                      ~KVM_SUPPORTED_FEATURE_CONTROL);
1985
1986         if (!msr->host_initiated &&
1987             (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
1988                 return false;
1989
1990         if (msr->host_initiated)
1991                 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
1992         else
1993                 valid_bits = vmx->msr_ia32_feature_control_valid_bits;
1994
1995         return !(msr->data & ~valid_bits);
1996 }
1997
1998 int vmx_get_feature_msr(u32 msr, u64 *data)
1999 {
2000         switch (msr) {
2001         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2002                 if (!nested)
2003                         return 1;
2004                 return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
2005         default:
2006                 return KVM_MSR_RET_UNSUPPORTED;
2007         }
2008 }
2009
2010 /*
2011  * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
2012  * Returns 0 on success, non-0 otherwise.
2013  * Assumes vcpu_load() was already called.
2014  */
2015 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2016 {
2017         struct vcpu_vmx *vmx = to_vmx(vcpu);
2018         struct vmx_uret_msr *msr;
2019         u32 index;
2020
2021         switch (msr_info->index) {
2022 #ifdef CONFIG_X86_64
2023         case MSR_FS_BASE:
2024                 msr_info->data = vmcs_readl(GUEST_FS_BASE);
2025                 break;
2026         case MSR_GS_BASE:
2027                 msr_info->data = vmcs_readl(GUEST_GS_BASE);
2028                 break;
2029         case MSR_KERNEL_GS_BASE:
2030                 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
2031                 break;
2032 #endif
2033         case MSR_EFER:
2034                 return kvm_get_msr_common(vcpu, msr_info);
2035         case MSR_IA32_TSX_CTRL:
2036                 if (!msr_info->host_initiated &&
2037                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2038                         return 1;
2039                 goto find_uret_msr;
2040         case MSR_IA32_UMWAIT_CONTROL:
2041                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2042                         return 1;
2043
2044                 msr_info->data = vmx->msr_ia32_umwait_control;
2045                 break;
2046         case MSR_IA32_SPEC_CTRL:
2047                 if (!msr_info->host_initiated &&
2048                     !guest_has_spec_ctrl_msr(vcpu))
2049                         return 1;
2050
2051                 msr_info->data = to_vmx(vcpu)->spec_ctrl;
2052                 break;
2053         case MSR_IA32_SYSENTER_CS:
2054                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2055                 break;
2056         case MSR_IA32_SYSENTER_EIP:
2057                 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
2058                 break;
2059         case MSR_IA32_SYSENTER_ESP:
2060                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
2061                 break;
2062         case MSR_IA32_BNDCFGS:
2063                 if (!kvm_mpx_supported() ||
2064                     (!msr_info->host_initiated &&
2065                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2066                         return 1;
2067                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
2068                 break;
2069         case MSR_IA32_MCG_EXT_CTL:
2070                 if (!msr_info->host_initiated &&
2071                     !(vmx->msr_ia32_feature_control &
2072                       FEAT_CTL_LMCE_ENABLED))
2073                         return 1;
2074                 msr_info->data = vcpu->arch.mcg_ext_ctl;
2075                 break;
2076         case MSR_IA32_FEAT_CTL:
2077                 msr_info->data = vmx->msr_ia32_feature_control;
2078                 break;
2079         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2080                 if (!msr_info->host_initiated &&
2081                     !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
2082                         return 1;
2083                 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
2084                         [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
2085                 break;
2086         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2087                 if (!guest_can_use(vcpu, X86_FEATURE_VMX))
2088                         return 1;
2089                 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2090                                     &msr_info->data))
2091                         return 1;
2092 #ifdef CONFIG_KVM_HYPERV
2093                 /*
2094                  * Enlightened VMCS v1 doesn't have certain VMCS fields but
2095                  * instead of just ignoring the features, different Hyper-V
2096                  * versions are either trying to use them and fail or do some
2097                  * sanity checking and refuse to boot. Filter all unsupported
2098                  * features out.
2099                  */
2100                 if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
2101                         nested_evmcs_filter_control_msr(vcpu, msr_info->index,
2102                                                         &msr_info->data);
2103 #endif
2104                 break;
2105         case MSR_IA32_RTIT_CTL:
2106                 if (!vmx_pt_mode_is_host_guest())
2107                         return 1;
2108                 msr_info->data = vmx->pt_desc.guest.ctl;
2109                 break;
2110         case MSR_IA32_RTIT_STATUS:
2111                 if (!vmx_pt_mode_is_host_guest())
2112                         return 1;
2113                 msr_info->data = vmx->pt_desc.guest.status;
2114                 break;
2115         case MSR_IA32_RTIT_CR3_MATCH:
2116                 if (!vmx_pt_mode_is_host_guest() ||
2117                         !intel_pt_validate_cap(vmx->pt_desc.caps,
2118                                                 PT_CAP_cr3_filtering))
2119                         return 1;
2120                 msr_info->data = vmx->pt_desc.guest.cr3_match;
2121                 break;
2122         case MSR_IA32_RTIT_OUTPUT_BASE:
2123                 if (!vmx_pt_mode_is_host_guest() ||
2124                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
2125                                         PT_CAP_topa_output) &&
2126                          !intel_pt_validate_cap(vmx->pt_desc.caps,
2127                                         PT_CAP_single_range_output)))
2128                         return 1;
2129                 msr_info->data = vmx->pt_desc.guest.output_base;
2130                 break;
2131         case MSR_IA32_RTIT_OUTPUT_MASK:
2132                 if (!vmx_pt_mode_is_host_guest() ||
2133                         (!intel_pt_validate_cap(vmx->pt_desc.caps,
2134                                         PT_CAP_topa_output) &&
2135                          !intel_pt_validate_cap(vmx->pt_desc.caps,
2136                                         PT_CAP_single_range_output)))
2137                         return 1;
2138                 msr_info->data = vmx->pt_desc.guest.output_mask;
2139                 break;
2140         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2141                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2142                 if (!vmx_pt_mode_is_host_guest() ||
2143                     (index >= 2 * vmx->pt_desc.num_address_ranges))
2144                         return 1;
2145                 if (index % 2)
2146                         msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2147                 else
2148                         msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2149                 break;
2150         case MSR_IA32_DEBUGCTLMSR:
2151                 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
2152                 break;
2153         default:
2154         find_uret_msr:
2155                 msr = vmx_find_uret_msr(vmx, msr_info->index);
2156                 if (msr) {
2157                         msr_info->data = msr->data;
2158                         break;
2159                 }
2160                 return kvm_get_msr_common(vcpu, msr_info);
2161         }
2162
2163         return 0;
2164 }
2165
2166 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2167                                                     u64 data)
2168 {
2169 #ifdef CONFIG_X86_64
2170         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
2171                 return (u32)data;
2172 #endif
2173         return (unsigned long)data;
2174 }
2175
2176 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
2177 {
2178         u64 debugctl = 0;
2179
2180         if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
2181             (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
2182                 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
2183
2184         if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
2185             (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
2186                 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
2187
2188         return debugctl;
2189 }
2190
2191 /*
2192  * Writes msr value into the appropriate "register".
2193  * Returns 0 on success, non-0 otherwise.
2194  * Assumes vcpu_load() was already called.
2195  */
2196 int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2197 {
2198         struct vcpu_vmx *vmx = to_vmx(vcpu);
2199         struct vmx_uret_msr *msr;
2200         int ret = 0;
2201         u32 msr_index = msr_info->index;
2202         u64 data = msr_info->data;
2203         u32 index;
2204
2205         switch (msr_index) {
2206         case MSR_EFER:
2207                 ret = kvm_set_msr_common(vcpu, msr_info);
2208                 break;
2209 #ifdef CONFIG_X86_64
2210         case MSR_FS_BASE:
2211                 vmx_segment_cache_clear(vmx);
2212                 vmcs_writel(GUEST_FS_BASE, data);
2213                 break;
2214         case MSR_GS_BASE:
2215                 vmx_segment_cache_clear(vmx);
2216                 vmcs_writel(GUEST_GS_BASE, data);
2217                 break;
2218         case MSR_KERNEL_GS_BASE:
2219                 vmx_write_guest_kernel_gs_base(vmx, data);
2220                 break;
2221         case MSR_IA32_XFD:
2222                 ret = kvm_set_msr_common(vcpu, msr_info);
2223                 /*
2224                  * Always intercepting WRMSR could incur non-negligible
2225                  * overhead given xfd might be changed frequently in
2226                  * guest context switch. Disable write interception
2227                  * upon the first write with a non-zero value (indicating
2228                  * potential usage on dynamic xfeatures). Also update
2229                  * exception bitmap to trap #NM for proper virtualization
2230                  * of guest xfd_err.
2231                  */
2232                 if (!ret && data) {
2233                         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
2234                                                       MSR_TYPE_RW);
2235                         vcpu->arch.xfd_no_write_intercept = true;
2236                         vmx_update_exception_bitmap(vcpu);
2237                 }
2238                 break;
2239 #endif
2240         case MSR_IA32_SYSENTER_CS:
2241                 if (is_guest_mode(vcpu))
2242                         get_vmcs12(vcpu)->guest_sysenter_cs = data;
2243                 vmcs_write32(GUEST_SYSENTER_CS, data);
2244                 break;
2245         case MSR_IA32_SYSENTER_EIP:
2246                 if (is_guest_mode(vcpu)) {
2247                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2248                         get_vmcs12(vcpu)->guest_sysenter_eip = data;
2249                 }
2250                 vmcs_writel(GUEST_SYSENTER_EIP, data);
2251                 break;
2252         case MSR_IA32_SYSENTER_ESP:
2253                 if (is_guest_mode(vcpu)) {
2254                         data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2255                         get_vmcs12(vcpu)->guest_sysenter_esp = data;
2256                 }
2257                 vmcs_writel(GUEST_SYSENTER_ESP, data);
2258                 break;
2259         case MSR_IA32_DEBUGCTLMSR: {
2260                 u64 invalid;
2261
2262                 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
2263                 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
2264                         kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
2265                         data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2266                         invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
2267                 }
2268
2269                 if (invalid)
2270                         return 1;
2271
2272                 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2273                                                 VM_EXIT_SAVE_DEBUG_CONTROLS)
2274                         get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2275
2276                 vmcs_write64(GUEST_IA32_DEBUGCTL, data);
2277                 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2278                     (data & DEBUGCTLMSR_LBR))
2279                         intel_pmu_create_guest_lbr_event(vcpu);
2280                 return 0;
2281         }
2282         case MSR_IA32_BNDCFGS:
2283                 if (!kvm_mpx_supported() ||
2284                     (!msr_info->host_initiated &&
2285                      !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2286                         return 1;
2287                 if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
2288                     (data & MSR_IA32_BNDCFGS_RSVD))
2289                         return 1;
2290
2291                 if (is_guest_mode(vcpu) &&
2292                     ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
2293                      (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
2294                         get_vmcs12(vcpu)->guest_bndcfgs = data;
2295
2296                 vmcs_write64(GUEST_BNDCFGS, data);
2297                 break;
2298         case MSR_IA32_UMWAIT_CONTROL:
2299                 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2300                         return 1;
2301
2302                 /* The reserved bit 1 and non-32 bit [63:32] should be zero */
2303                 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2304                         return 1;
2305
2306                 vmx->msr_ia32_umwait_control = data;
2307                 break;
2308         case MSR_IA32_SPEC_CTRL:
2309                 if (!msr_info->host_initiated &&
2310                     !guest_has_spec_ctrl_msr(vcpu))
2311                         return 1;
2312
2313                 if (kvm_spec_ctrl_test_value(data))
2314                         return 1;
2315
2316                 vmx->spec_ctrl = data;
2317                 if (!data)
2318                         break;
2319
2320                 /*
2321                  * For non-nested:
2322                  * When it's written (to non-zero) for the first time, pass
2323                  * it through.
2324                  *
2325                  * For nested:
2326                  * The handling of the MSR bitmap for L2 guests is done in
2327                  * nested_vmx_prepare_msr_bitmap. We should not touch the
2328                  * vmcs02.msr_bitmap here since it gets completely overwritten
2329                  * in the merging. We update the vmcs01 here for L1 as well
2330                  * since it will end up touching the MSR anyway now.
2331                  */
2332                 vmx_disable_intercept_for_msr(vcpu,
2333                                               MSR_IA32_SPEC_CTRL,
2334                                               MSR_TYPE_RW);
2335                 break;
2336         case MSR_IA32_TSX_CTRL:
2337                 if (!msr_info->host_initiated &&
2338                     !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2339                         return 1;
2340                 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2341                         return 1;
2342                 goto find_uret_msr;
2343         case MSR_IA32_CR_PAT:
2344                 ret = kvm_set_msr_common(vcpu, msr_info);
2345                 if (ret)
2346                         break;
2347
2348                 if (is_guest_mode(vcpu) &&
2349                     get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2350                         get_vmcs12(vcpu)->guest_ia32_pat = data;
2351
2352                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
2353                         vmcs_write64(GUEST_IA32_PAT, data);
2354                 break;
2355         case MSR_IA32_MCG_EXT_CTL:
2356                 if ((!msr_info->host_initiated &&
2357                      !(to_vmx(vcpu)->msr_ia32_feature_control &
2358                        FEAT_CTL_LMCE_ENABLED)) ||
2359                     (data & ~MCG_EXT_CTL_LMCE_EN))
2360                         return 1;
2361                 vcpu->arch.mcg_ext_ctl = data;
2362                 break;
2363         case MSR_IA32_FEAT_CTL:
2364                 if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
2365                         return 1;
2366
2367                 vmx->msr_ia32_feature_control = data;
2368                 if (msr_info->host_initiated && data == 0)
2369                         vmx_leave_nested(vcpu);
2370
2371                 /* SGX may be enabled/disabled by guest's firmware */
2372                 vmx_write_encls_bitmap(vcpu, NULL);
2373                 break;
2374         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2375                 /*
2376                  * On real hardware, the LE hash MSRs are writable before
2377                  * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2378                  * at which point SGX related bits in IA32_FEATURE_CONTROL
2379                  * become writable.
2380                  *
2381                  * KVM does not emulate SGX activation for simplicity, so
2382                  * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2383                  * is unlocked.  This is technically not architectural
2384                  * behavior, but it's close enough.
2385                  */
2386                 if (!msr_info->host_initiated &&
2387                     (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
2388                     ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2389                     !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2390                         return 1;
2391                 vmx->msr_ia32_sgxlepubkeyhash
2392                         [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
2393                 break;
2394         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2395                 if (!msr_info->host_initiated)
2396                         return 1; /* they are read-only */
2397                 if (!guest_can_use(vcpu, X86_FEATURE_VMX))
2398                         return 1;
2399                 return vmx_set_vmx_msr(vcpu, msr_index, data);
2400         case MSR_IA32_RTIT_CTL:
2401                 if (!vmx_pt_mode_is_host_guest() ||
2402                         vmx_rtit_ctl_check(vcpu, data) ||
2403                         vmx->nested.vmxon)
2404                         return 1;
2405                 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2406                 vmx->pt_desc.guest.ctl = data;
2407                 pt_update_intercept_for_msr(vcpu);
2408                 break;
2409         case MSR_IA32_RTIT_STATUS:
2410                 if (!pt_can_write_msr(vmx))
2411                         return 1;
2412                 if (data & MSR_IA32_RTIT_STATUS_MASK)
2413                         return 1;
2414                 vmx->pt_desc.guest.status = data;
2415                 break;
2416         case MSR_IA32_RTIT_CR3_MATCH:
2417                 if (!pt_can_write_msr(vmx))
2418                         return 1;
2419                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2420                                            PT_CAP_cr3_filtering))
2421                         return 1;
2422                 vmx->pt_desc.guest.cr3_match = data;
2423                 break;
2424         case MSR_IA32_RTIT_OUTPUT_BASE:
2425                 if (!pt_can_write_msr(vmx))
2426                         return 1;
2427                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2428                                            PT_CAP_topa_output) &&
2429                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2430                                            PT_CAP_single_range_output))
2431                         return 1;
2432                 if (!pt_output_base_valid(vcpu, data))
2433                         return 1;
2434                 vmx->pt_desc.guest.output_base = data;
2435                 break;
2436         case MSR_IA32_RTIT_OUTPUT_MASK:
2437                 if (!pt_can_write_msr(vmx))
2438                         return 1;
2439                 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2440                                            PT_CAP_topa_output) &&
2441                     !intel_pt_validate_cap(vmx->pt_desc.caps,
2442                                            PT_CAP_single_range_output))
2443                         return 1;
2444                 vmx->pt_desc.guest.output_mask = data;
2445                 break;
2446         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2447                 if (!pt_can_write_msr(vmx))
2448                         return 1;
2449                 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2450                 if (index >= 2 * vmx->pt_desc.num_address_ranges)
2451                         return 1;
2452                 if (is_noncanonical_msr_address(data, vcpu))
2453                         return 1;
2454                 if (index % 2)
2455                         vmx->pt_desc.guest.addr_b[index / 2] = data;
2456                 else
2457                         vmx->pt_desc.guest.addr_a[index / 2] = data;
2458                 break;
2459         case MSR_IA32_PERF_CAPABILITIES:
2460                 if (data & PMU_CAP_LBR_FMT) {
2461                         if ((data & PMU_CAP_LBR_FMT) !=
2462                             (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
2463                                 return 1;
2464                         if (!cpuid_model_is_consistent(vcpu))
2465                                 return 1;
2466                 }
2467                 if (data & PERF_CAP_PEBS_FORMAT) {
2468                         if ((data & PERF_CAP_PEBS_MASK) !=
2469                             (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
2470                                 return 1;
2471                         if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
2472                                 return 1;
2473                         if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
2474                                 return 1;
2475                         if (!cpuid_model_is_consistent(vcpu))
2476                                 return 1;
2477                 }
2478                 ret = kvm_set_msr_common(vcpu, msr_info);
2479                 break;
2480
2481         default:
2482         find_uret_msr:
2483                 msr = vmx_find_uret_msr(vmx, msr_index);
2484                 if (msr)
2485                         ret = vmx_set_guest_uret_msr(vmx, msr, data);
2486                 else
2487                         ret = kvm_set_msr_common(vcpu, msr_info);
2488         }
2489
2490         /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
2491         if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2492                 vmx_update_fb_clear_dis(vcpu, vmx);
2493
2494         return ret;
2495 }
2496
2497 void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2498 {
2499         unsigned long guest_owned_bits;
2500
2501         kvm_register_mark_available(vcpu, reg);
2502
2503         switch (reg) {
2504         case VCPU_REGS_RSP:
2505                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2506                 break;
2507         case VCPU_REGS_RIP:
2508                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2509                 break;
2510         case VCPU_EXREG_PDPTR:
2511                 if (enable_ept)
2512                         ept_save_pdptrs(vcpu);
2513                 break;
2514         case VCPU_EXREG_CR0:
2515                 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2516
2517                 vcpu->arch.cr0 &= ~guest_owned_bits;
2518                 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
2519                 break;
2520         case VCPU_EXREG_CR3:
2521                 /*
2522                  * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
2523                  * CR3 is loaded into hardware, not the guest's CR3.
2524                  */
2525                 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
2526                         vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2527                 break;
2528         case VCPU_EXREG_CR4:
2529                 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2530
2531                 vcpu->arch.cr4 &= ~guest_owned_bits;
2532                 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
2533                 break;
2534         default:
2535                 KVM_BUG_ON(1, vcpu->kvm);
2536                 break;
2537         }
2538 }
2539
2540 /*
2541  * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2542  * directly instead of going through cpu_has(), to ensure KVM is trapping
2543  * ENCLS whenever it's supported in hardware.  It does not matter whether
2544  * the host OS supports or has enabled SGX.
2545  */
2546 static bool cpu_has_sgx(void)
2547 {
2548         return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
2549 }
2550
2551 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
2552 {
2553         u32 vmx_msr_low, vmx_msr_high;
2554         u32 ctl = ctl_min | ctl_opt;
2555
2556         rdmsr(msr, vmx_msr_low, vmx_msr_high);
2557
2558         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2559         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2560
2561         /* Ensure minimum (required) set of control bits are supported. */
2562         if (ctl_min & ~ctl)
2563                 return -EIO;
2564
2565         *result = ctl;
2566         return 0;
2567 }
2568
2569 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
2570 {
2571         u64 allowed;
2572
2573         rdmsrl(msr, allowed);
2574
2575         return  ctl_opt & allowed;
2576 }
2577
2578 static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2579                              struct vmx_capability *vmx_cap)
2580 {
2581         u32 _pin_based_exec_control = 0;
2582         u32 _cpu_based_exec_control = 0;
2583         u32 _cpu_based_2nd_exec_control = 0;
2584         u64 _cpu_based_3rd_exec_control = 0;
2585         u32 _vmexit_control = 0;
2586         u32 _vmentry_control = 0;
2587         u64 basic_msr;
2588         u64 misc_msr;
2589         int i;
2590
2591         /*
2592          * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2593          * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2594          * intercepts writes to PAT and EFER, i.e. never enables those controls.
2595          */
2596         struct {
2597                 u32 entry_control;
2598                 u32 exit_control;
2599         } const vmcs_entry_exit_pairs[] = {
2600                 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,  VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2601                 { VM_ENTRY_LOAD_IA32_PAT,               VM_EXIT_LOAD_IA32_PAT },
2602                 { VM_ENTRY_LOAD_IA32_EFER,              VM_EXIT_LOAD_IA32_EFER },
2603                 { VM_ENTRY_LOAD_BNDCFGS,                VM_EXIT_CLEAR_BNDCFGS },
2604                 { VM_ENTRY_LOAD_IA32_RTIT_CTL,          VM_EXIT_CLEAR_IA32_RTIT_CTL },
2605         };
2606
2607         memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2608
2609         if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2610                                 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2611                                 MSR_IA32_VMX_PROCBASED_CTLS,
2612                                 &_cpu_based_exec_control))
2613                 return -EIO;
2614         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2615                 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2616                                         KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
2617                                         MSR_IA32_VMX_PROCBASED_CTLS2,
2618                                         &_cpu_based_2nd_exec_control))
2619                         return -EIO;
2620         }
2621         if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
2622                 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
2623
2624 #ifndef CONFIG_X86_64
2625         if (!(_cpu_based_2nd_exec_control &
2626                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2627                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2628 #endif
2629
2630         if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2631                 _cpu_based_2nd_exec_control &= ~(
2632                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2633                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2634                                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2635
2636         rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2637                 &vmx_cap->ept, &vmx_cap->vpid);
2638
2639         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2640             vmx_cap->ept) {
2641                 pr_warn_once("EPT CAP should not exist if not support "
2642                                 "1-setting enable EPT VM-execution control\n");
2643
2644                 if (error_on_inconsistent_vmcs_config)
2645                         return -EIO;
2646
2647                 vmx_cap->ept = 0;
2648                 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
2649         }
2650         if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2651             vmx_cap->vpid) {
2652                 pr_warn_once("VPID CAP should not exist if not support "
2653                                 "1-setting enable VPID VM-execution control\n");
2654
2655                 if (error_on_inconsistent_vmcs_config)
2656                         return -EIO;
2657
2658                 vmx_cap->vpid = 0;
2659         }
2660
2661         if (!cpu_has_sgx())
2662                 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2663
2664         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2665                 _cpu_based_3rd_exec_control =
2666                         adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
2667                                               MSR_IA32_VMX_PROCBASED_CTLS3);
2668
2669         if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2670                                 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2671                                 MSR_IA32_VMX_EXIT_CTLS,
2672                                 &_vmexit_control))
2673                 return -EIO;
2674
2675         if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2676                                 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2677                                 MSR_IA32_VMX_PINBASED_CTLS,
2678                                 &_pin_based_exec_control))
2679                 return -EIO;
2680
2681         if (cpu_has_broken_vmx_preemption_timer())
2682                 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2683         if (!(_cpu_based_2nd_exec_control &
2684                 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2685                 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2686
2687         if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2688                                 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2689                                 MSR_IA32_VMX_ENTRY_CTLS,
2690                                 &_vmentry_control))
2691                 return -EIO;
2692
2693         for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
2694                 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
2695                 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
2696
2697                 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
2698                         continue;
2699
2700                 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
2701                              _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
2702
2703                 if (error_on_inconsistent_vmcs_config)
2704                         return -EIO;
2705
2706                 _vmentry_control &= ~n_ctrl;
2707                 _vmexit_control &= ~x_ctrl;
2708         }
2709
2710         /*
2711          * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2712          * can't be used due to an errata where VM Exit may incorrectly clear
2713          * IA32_PERF_GLOBAL_CTRL[34:32].  Workaround the errata by using the
2714          * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2715          */
2716         switch (boot_cpu_data.x86_vfm) {
2717         case INTEL_NEHALEM_EP:  /* AAK155 */
2718         case INTEL_NEHALEM:     /* AAP115 */
2719         case INTEL_WESTMERE:    /* AAT100 */
2720         case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
2721         case INTEL_NEHALEM_EX:  /* BA97 */
2722                 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
2723                 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2724                 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2725                              "does not work properly. Using workaround\n");
2726                 break;
2727         default:
2728                 break;
2729         }
2730
2731         rdmsrl(MSR_IA32_VMX_BASIC, basic_msr);
2732
2733         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2734         if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
2735                 return -EIO;
2736
2737 #ifdef CONFIG_X86_64
2738         /*
2739          * KVM expects to be able to shove all legal physical addresses into
2740          * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
2741          * 0 for processors that support Intel 64 architecture".
2742          */
2743         if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
2744                 return -EIO;
2745 #endif
2746
2747         /* Require Write-Back (WB) memory type for VMCS accesses. */
2748         if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
2749                 return -EIO;
2750
2751         rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
2752
2753         vmcs_conf->basic = basic_msr;
2754         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2755         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2756         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2757         vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
2758         vmcs_conf->vmexit_ctrl         = _vmexit_control;
2759         vmcs_conf->vmentry_ctrl        = _vmentry_control;
2760         vmcs_conf->misc = misc_msr;
2761
2762 #if IS_ENABLED(CONFIG_HYPERV)
2763         if (enlightened_vmcs)
2764                 evmcs_sanitize_exec_ctrls(vmcs_conf);
2765 #endif
2766
2767         return 0;
2768 }
2769
2770 static bool __kvm_is_vmx_supported(void)
2771 {
2772         int cpu = smp_processor_id();
2773
2774         if (!(cpuid_ecx(1) & feature_bit(VMX))) {
2775                 pr_err("VMX not supported by CPU %d\n", cpu);
2776                 return false;
2777         }
2778
2779         if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2780             !this_cpu_has(X86_FEATURE_VMX)) {
2781                 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
2782                 return false;
2783         }
2784
2785         return true;
2786 }
2787
2788 static bool kvm_is_vmx_supported(void)
2789 {
2790         bool supported;
2791
2792         migrate_disable();
2793         supported = __kvm_is_vmx_supported();
2794         migrate_enable();
2795
2796         return supported;
2797 }
2798
2799 int vmx_check_processor_compat(void)
2800 {
2801         int cpu = raw_smp_processor_id();
2802         struct vmcs_config vmcs_conf;
2803         struct vmx_capability vmx_cap;
2804
2805         if (!__kvm_is_vmx_supported())
2806                 return -EIO;
2807
2808         if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
2809                 pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
2810                 return -EIO;
2811         }
2812         if (nested)
2813                 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
2814         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
2815                 pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
2816                 return -EIO;
2817         }
2818         return 0;
2819 }
2820
2821 static int kvm_cpu_vmxon(u64 vmxon_pointer)
2822 {
2823         u64 msr;
2824
2825         cr4_set_bits(X86_CR4_VMXE);
2826
2827         asm goto("1: vmxon %[vmxon_pointer]\n\t"
2828                           _ASM_EXTABLE(1b, %l[fault])
2829                           : : [vmxon_pointer] "m"(vmxon_pointer)
2830                           : : fault);
2831         return 0;
2832
2833 fault:
2834         WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2835                   rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
2836         cr4_clear_bits(X86_CR4_VMXE);
2837
2838         return -EFAULT;
2839 }
2840
2841 int vmx_enable_virtualization_cpu(void)
2842 {
2843         int cpu = raw_smp_processor_id();
2844         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2845         int r;
2846
2847         if (cr4_read_shadow() & X86_CR4_VMXE)
2848                 return -EBUSY;
2849
2850         /*
2851          * This can happen if we hot-added a CPU but failed to allocate
2852          * VP assist page for it.
2853          */
2854         if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
2855                 return -EFAULT;
2856
2857         intel_pt_handle_vmx(1);
2858
2859         r = kvm_cpu_vmxon(phys_addr);
2860         if (r) {
2861                 intel_pt_handle_vmx(0);
2862                 return r;
2863         }
2864
2865         return 0;
2866 }
2867
2868 static void vmclear_local_loaded_vmcss(void)
2869 {
2870         int cpu = raw_smp_processor_id();
2871         struct loaded_vmcs *v, *n;
2872
2873         list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2874                                  loaded_vmcss_on_cpu_link)
2875                 __loaded_vmcs_clear(v);
2876 }
2877
/*
 * Turn off VMX operation on the current CPU.
 *
 * Clears every VMCS on this CPU's loaded list first, then calls
 * kvm_cpu_vmxoff() (a non-zero result is reported via kvm_spurious_fault()),
 * resets eVMCS state and finally tells Intel PT that VMX is off.
 */
void vmx_disable_virtualization_cpu(void)
{
	vmclear_local_loaded_vmcss();

	if (kvm_cpu_vmxoff() != 0)
		kvm_spurious_fault();

	hv_reset_evmcs();

	intel_pt_handle_vmx(0);
}
2889
2890 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
2891 {
2892         int node = cpu_to_node(cpu);
2893         struct page *pages;
2894         struct vmcs *vmcs;
2895
2896         pages = __alloc_pages_node(node, flags, 0);
2897         if (!pages)
2898                 return NULL;
2899         vmcs = page_address(pages);
2900         memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
2901
2902         /* KVM supports Enlightened VMCS v1 only */
2903         if (kvm_is_using_evmcs())
2904                 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2905         else
2906                 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
2907
2908         if (shadow)
2909                 vmcs->hdr.shadow_vmcs = 1;
2910         return vmcs;
2911 }
2912
/* Release the single page backing @vmcs via free_page(). */
void free_vmcs(struct vmcs *vmcs)
{
	unsigned long addr = (unsigned long)vmcs;

	free_page(addr);
}
2917
2918 /*
2919  * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2920  */
2921 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2922 {
2923         if (!loaded_vmcs->vmcs)
2924                 return;
2925         loaded_vmcs_clear(loaded_vmcs);
2926         free_vmcs(loaded_vmcs->vmcs);
2927         loaded_vmcs->vmcs = NULL;
2928         if (loaded_vmcs->msr_bitmap)
2929                 free_page((unsigned long)loaded_vmcs->msr_bitmap);
2930         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2931 }
2932
2933 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2934 {
2935         loaded_vmcs->vmcs = alloc_vmcs(false);
2936         if (!loaded_vmcs->vmcs)
2937                 return -ENOMEM;
2938
2939         vmcs_clear(loaded_vmcs->vmcs);
2940
2941         loaded_vmcs->shadow_vmcs = NULL;
2942         loaded_vmcs->hv_timer_soft_disabled = false;
2943         loaded_vmcs->cpu = -1;
2944         loaded_vmcs->launched = 0;
2945
2946         if (cpu_has_vmx_msr_bitmap()) {
2947                 loaded_vmcs->msr_bitmap = (unsigned long *)
2948                                 __get_free_page(GFP_KERNEL_ACCOUNT);
2949                 if (!loaded_vmcs->msr_bitmap)
2950                         goto out_vmcs;
2951                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2952         }
2953
2954         memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
2955         memset(&loaded_vmcs->controls_shadow, 0,
2956                 sizeof(struct vmcs_controls_shadow));
2957
2958         return 0;
2959
2960 out_vmcs:
2961         free_loaded_vmcs(loaded_vmcs);
2962         return -ENOMEM;
2963 }
2964
2965 static void free_kvm_area(void)
2966 {
2967         int cpu;
2968
2969         for_each_possible_cpu(cpu) {
2970                 free_vmcs(per_cpu(vmxarea, cpu));
2971                 per_cpu(vmxarea, cpu) = NULL;
2972         }
2973 }
2974
2975 static __init int alloc_kvm_area(void)
2976 {
2977         int cpu;
2978
2979         for_each_possible_cpu(cpu) {
2980                 struct vmcs *vmcs;
2981
2982                 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
2983                 if (!vmcs) {
2984                         free_kvm_area();
2985                         return -ENOMEM;
2986                 }
2987
2988                 /*
2989                  * When eVMCS is enabled, alloc_vmcs_cpu() sets
2990                  * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2991                  * revision_id reported by MSR_IA32_VMX_BASIC.
2992                  *
2993                  * However, even though not explicitly documented by
2994                  * TLFS, VMXArea passed as VMXON argument should
2995                  * still be marked with revision_id reported by
2996                  * physical CPU.
2997                  */
2998                 if (kvm_is_using_evmcs())
2999                         vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
3000
3001                 per_cpu(vmxarea, cpu) = vmcs;
3002         }
3003         return 0;
3004 }
3005
3006 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3007                 struct kvm_segment *save)
3008 {
3009         if (!emulate_invalid_guest_state) {
3010                 /*
3011                  * CS and SS RPL should be equal during guest entry according
3012                  * to VMX spec, but in reality it is not always so. Since vcpu
3013                  * is in the middle of the transition from real mode to
3014                  * protected mode it is safe to assume that RPL 0 is a good
3015                  * default value.
3016                  */
3017                 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3018                         save->selector &= ~SEGMENT_RPL_MASK;
3019                 save->dpl = save->selector & SEGMENT_RPL_MASK;
3020                 save->s = 1;
3021         }
3022         __vmx_set_segment(vcpu, save, seg);
3023 }
3024
3025 static void enter_pmode(struct kvm_vcpu *vcpu)
3026 {
3027         unsigned long flags;
3028         struct vcpu_vmx *vmx = to_vmx(vcpu);
3029
3030         /*
3031          * Update real mode segment cache. It may be not up-to-date if segment
3032          * register was written while vcpu was in a guest mode.
3033          */
3034         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3035         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3036         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3037         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3038         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3039         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3040
3041         vmx->rmode.vm86_active = 0;
3042
3043         __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3044
3045         flags = vmcs_readl(GUEST_RFLAGS);
3046         flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3047         flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3048         vmcs_writel(GUEST_RFLAGS, flags);
3049
3050         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3051                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3052
3053         vmx_update_exception_bitmap(vcpu);
3054
3055         fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3056         fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3057         fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3058         fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3059         fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3060         fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3061 }
3062
3063 static void fix_rmode_seg(int seg, struct kvm_segment *save)
3064 {
3065         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3066         struct kvm_segment var = *save;
3067
3068         var.dpl = 0x3;
3069         if (seg == VCPU_SREG_CS)
3070                 var.type = 0x3;
3071
3072         if (!emulate_invalid_guest_state) {
3073                 var.selector = var.base >> 4;
3074                 var.base = var.base & 0xffff0;
3075                 var.limit = 0xffff;
3076                 var.g = 0;
3077                 var.db = 0;
3078                 var.present = 1;
3079                 var.s = 1;
3080                 var.l = 0;
3081                 var.unusable = 0;
3082                 var.type = 0x3;
3083                 var.avl = 0;
3084                 if (save->base & 0xf)
3085                         pr_warn_once("segment base is not paragraph aligned "
3086                                      "when entering protected mode (seg=%d)", seg);
3087         }
3088
3089         vmcs_write16(sf->selector, var.selector);
3090         vmcs_writel(sf->base, var.base);
3091         vmcs_write32(sf->limit, var.limit);
3092         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3093 }
3094
3095 static void enter_rmode(struct kvm_vcpu *vcpu)
3096 {
3097         unsigned long flags;
3098         struct vcpu_vmx *vmx = to_vmx(vcpu);
3099         struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
3100
3101         /*
3102          * KVM should never use VM86 to virtualize Real Mode when L2 is active,
3103          * as using VM86 is unnecessary if unrestricted guest is enabled, and
3104          * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
3105          * should VM-Fail and KVM should reject userspace attempts to stuff
3106          * CR0.PG=0 when L2 is active.
3107          */
3108         WARN_ON_ONCE(is_guest_mode(vcpu));
3109
3110         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3111         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3112         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3113         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3114         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3115         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3116         vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3117
3118         vmx->rmode.vm86_active = 1;
3119
3120         vmx_segment_cache_clear(vmx);
3121
3122         vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
3123         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3124         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3125
3126         flags = vmcs_readl(GUEST_RFLAGS);
3127         vmx->rmode.save_rflags = flags;
3128
3129         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3130
3131         vmcs_writel(GUEST_RFLAGS, flags);
3132         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3133         vmx_update_exception_bitmap(vcpu);
3134
3135         fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3136         fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3137         fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3138         fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3139         fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3140         fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3141 }
3142
3143 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3144 {
3145         struct vcpu_vmx *vmx = to_vmx(vcpu);
3146
3147         /* Nothing to do if hardware doesn't support EFER. */
3148         if (!vmx_find_uret_msr(vmx, MSR_EFER))
3149                 return 0;
3150
3151         vcpu->arch.efer = efer;
3152 #ifdef CONFIG_X86_64
3153         if (efer & EFER_LMA)
3154                 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
3155         else
3156                 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
3157 #else
3158         if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
3159                 return 1;
3160 #endif
3161
3162         vmx_setup_uret_msrs(vmx);
3163         return 0;
3164 }
3165
3166 #ifdef CONFIG_X86_64
3167
3168 static void enter_lmode(struct kvm_vcpu *vcpu)
3169 {
3170         u32 guest_tr_ar;
3171
3172         vmx_segment_cache_clear(to_vmx(vcpu));
3173
3174         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3175         if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
3176                 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3177                                      __func__);
3178                 vmcs_write32(GUEST_TR_AR_BYTES,
3179                              (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3180                              | VMX_AR_TYPE_BUSY_64_TSS);
3181         }
3182         vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3183 }
3184
3185 static void exit_lmode(struct kvm_vcpu *vcpu)
3186 {
3187         vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3188 }
3189
3190 #endif
3191
3192 void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
3193 {
3194         struct vcpu_vmx *vmx = to_vmx(vcpu);
3195
3196         /*
3197          * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
3198          * the CPU is not required to invalidate guest-physical mappings on
3199          * VM-Entry, even if VPID is disabled.  Guest-physical mappings are
3200          * associated with the root EPT structure and not any particular VPID
3201          * (INVVPID also isn't required to invalidate guest-physical mappings).
3202          */
3203         if (enable_ept) {
3204                 ept_sync_global();
3205         } else if (enable_vpid) {
3206                 if (cpu_has_vmx_invvpid_global()) {
3207                         vpid_sync_vcpu_global();
3208                 } else {
3209                         vpid_sync_vcpu_single(vmx->vpid);
3210                         vpid_sync_vcpu_single(vmx->nested.vpid02);
3211                 }
3212         }
3213 }
3214
3215 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3216 {
3217         if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
3218                 return nested_get_vpid02(vcpu);
3219         return to_vmx(vcpu)->vpid;
3220 }
3221
3222 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3223 {
3224         struct kvm_mmu *mmu = vcpu->arch.mmu;
3225         u64 root_hpa = mmu->root.hpa;
3226
3227         /* No flush required if the current context is invalid. */
3228         if (!VALID_PAGE(root_hpa))
3229                 return;
3230
3231         if (enable_ept)
3232                 ept_sync_context(construct_eptp(vcpu, root_hpa,
3233                                                 mmu->root_role.level));
3234         else
3235                 vpid_sync_context(vmx_get_current_vpid(vcpu));
3236 }
3237
3238 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3239 {
3240         /*
3241          * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
3242          * vmx_flush_tlb_guest() for an explanation of why this is ok.
3243          */
3244         vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
3245 }
3246
/* Flush the guest's linear mappings tagged with the current VPID. */
void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
        const int vpid = vmx_get_current_vpid(vcpu);

        /*
         * vpid_sync_context() does nothing for vpid==0, e.g. when
         * enable_vpid==0 or no vpid could be allocated for this vCPU.  With
         * vpid disabled, VM-Enter and VM-Exit are required to flush
         * GVA->{G,H}PA mappings from the TLB (VM-Enter with vpid enabled and
         * vpid==0 is disallowed), so no explicit INVVPID is needed.
         */
        vpid_sync_context(vpid);
}
3258
3259 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
3260 {
3261         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3262
3263         if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
3264                 return;
3265
3266         if (is_pae_paging(vcpu)) {
3267                 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3268                 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3269                 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3270                 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3271         }
3272 }
3273
3274 void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3275 {
3276         struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3277
3278         if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3279                 return;
3280
3281         mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3282         mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3283         mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3284         mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3285
3286         kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
3287 }
3288
/*
 * Both CR3 interception controls (load and store) as one mask; vmx_set_cr0()
 * sets, clears or merges them as a unit.
 */
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
                          CPU_BASED_CR3_STORE_EXITING)
3291
3292 bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3293 {
3294         if (is_guest_mode(vcpu))
3295                 return nested_guest_cr0_valid(vcpu, cr0);
3296
3297         if (to_vmx(vcpu)->nested.vmxon)
3298                 return nested_host_cr0_valid(vcpu, cr0);
3299
3300         return true;
3301 }
3302
/*
 * Install a new guest CR0.
 *
 * Computes the value actually loaded into hardware (hw_cr0), switches VM86
 * Real Mode emulation on or off when CR0.PE changes without unrestricted
 * guest, writes the CR0 read shadow and GUEST_CR0, enters/exits long mode when
 * CR0.PG toggles with EFER.LME set, and, for EPT without unrestricted guest,
 * updates CR3 interception and the CR4/CR3 state that depends on CR0.PG.
 * Finally recomputes vmx->emulation_required.
 */
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long hw_cr0, old_cr0_pg;
        u32 tmp;

        /* Snapshot the old CR0.PG before vcpu->arch.cr0 is overwritten below. */
        old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);

        hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
        if (enable_unrestricted_guest)
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
        else {
                hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
                if (!enable_ept)
                        hw_cr0 |= X86_CR0_WP;

                /* CR0.PE transitions toggle VM86-based Real Mode emulation. */
                if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
                        enter_pmode(vcpu);

                if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
                        enter_rmode(vcpu);
        }

        /* The guest reads back 'cr0'; hardware runs with 'hw_cr0'. */
        vmcs_writel(CR0_READ_SHADOW, cr0);
        vmcs_writel(GUEST_CR0, hw_cr0);
        vcpu->arch.cr0 = cr0;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);

#ifdef CONFIG_X86_64
        /* With EFER.LME set, toggling CR0.PG enters or leaves long mode. */
        if (vcpu->arch.efer & EFER_LME) {
                if (!old_cr0_pg && (cr0 & X86_CR0_PG))
                        enter_lmode(vcpu);
                else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
                        exit_lmode(vcpu);
        }
#endif

        if (enable_ept && !enable_unrestricted_guest) {
                /*
                 * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
                 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
                 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
                 * KVM's CR3 is installed.
                 */
                if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
                        vmx_cache_reg(vcpu, VCPU_EXREG_CR3);

                /*
                 * When running with EPT but not unrestricted guest, KVM must
                 * intercept CR3 accesses when paging is _disabled_.  This is
                 * necessary because restricted guests can't actually run with
                 * paging disabled, and so KVM stuffs its own CR3 in order to
                 * run the guest with identity mapped page tables.
                 *
                 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
                 * update, it may be stale with respect to CR3 interception,
                 * e.g. after nested VM-Enter.
                 *
                 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
                 * stores to forward them to L1, even if KVM does not need to
                 * intercept them to preserve its identity mapped page tables.
                 */
                if (!(cr0 & X86_CR0_PG)) {
                        exec_controls_setbit(vmx, CR3_EXITING_BITS);
                } else if (!is_guest_mode(vcpu)) {
                        exec_controls_clearbit(vmx, CR3_EXITING_BITS);
                } else {
                        /* L2 active: take the CR3 exiting bits from vmcs12. */
                        tmp = exec_controls_get(vmx);
                        tmp &= ~CR3_EXITING_BITS;
                        tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
                        exec_controls_set(vmx, tmp);
                }

                /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
                if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
                        vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));

                /*
                 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
                 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
                 */
                if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
                        kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
        }

        /* depends on vcpu->arch.cr0 to be set to a new value */
        vmx->emulation_required = vmx_emulation_required(vcpu);
}
3391
/* Return the deepest supported EPT walk: 5 if 5-level EPT is available, else 4. */
static int vmx_get_max_ept_level(void)
{
        return cpu_has_vmx_ept_5levels() ? 5 : 4;
}
3398
3399 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
3400 {
3401         u64 eptp = VMX_EPTP_MT_WB;
3402
3403         eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3404
3405         if (enable_ept_ad_bits &&
3406             (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
3407                 eptp |= VMX_EPTP_AD_ENABLE_BIT;
3408         eptp |= root_hpa;
3409
3410         return eptp;
3411 }
3412
3413 void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
3414 {
3415         struct kvm *kvm = vcpu->kvm;
3416         bool update_guest_cr3 = true;
3417         unsigned long guest_cr3;
3418         u64 eptp;
3419
3420         if (enable_ept) {
3421                 eptp = construct_eptp(vcpu, root_hpa, root_level);
3422                 vmcs_write64(EPT_POINTER, eptp);
3423
3424                 hv_track_root_tdp(vcpu, root_hpa);
3425
3426                 if (!enable_unrestricted_guest && !is_paging(vcpu))
3427                         guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3428                 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
3429                         guest_cr3 = vcpu->arch.cr3;
3430                 else /* vmcs.GUEST_CR3 is already up-to-date. */
3431                         update_guest_cr3 = false;
3432                 vmx_ept_load_pdptrs(vcpu);
3433         } else {
3434                 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
3435                             kvm_get_active_cr3_lam_bits(vcpu);
3436         }
3437
3438         if (update_guest_cr3)
3439                 vmcs_writel(GUEST_CR3, guest_cr3);
3440 }
3441
3442 bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3443 {
3444         /*
3445          * We operate under the default treatment of SMM, so VMX cannot be
3446          * enabled under SMM.  Note, whether or not VMXE is allowed at all,
3447          * i.e. is a reserved bit, is handled by common x86 code.
3448          */
3449         if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3450                 return false;
3451
3452         if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3453                 return false;
3454
3455         return true;
3456 }
3457
/*
 * Install a new guest CR4.
 *
 * Computes the value actually loaded into hardware (hw_cr4), handles UMIP
 * emulation via the SECONDARY_EXEC_DESC control, records the guest-visible
 * value in vcpu->arch.cr4 and the CR4 read shadow, and refreshes runtime
 * CPUID when OSXSAVE or PKE changes.
 */
void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
        unsigned long old_cr4 = kvm_read_cr4(vcpu);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long hw_cr4;

        /*
         * Pass through host's Machine Check Enable value to hw_cr4, which
         * is in force while we are in guest mode.  Do not let guests control
         * this bit, even if host CR4.MCE == 0.
         */
        hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
        if (enable_unrestricted_guest)
                hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
        else if (vmx->rmode.vm86_active)
                hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
        else
                hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;

        /*
         * When UMIP is emulated, a guest CR4.UMIP=1 is implemented by setting
         * SECONDARY_EXEC_DESC and keeping UMIP clear in hardware CR4.  The
         * control is cleared again unless L2 is active and vmcs12 has it set.
         */
        if (vmx_umip_emulated()) {
                if (cr4 & X86_CR4_UMIP) {
                        secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
                        hw_cr4 &= ~X86_CR4_UMIP;
                } else if (!is_guest_mode(vcpu) ||
                        !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
                        secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
                }
        }

        /* is_paging() below is evaluated after the new CR4 is recorded. */
        vcpu->arch.cr4 = cr4;
        kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);

        if (!enable_unrestricted_guest) {
                if (enable_ept) {
                        /*
                         * Guest paging disabled: hardware CR4 gets PAE cleared
                         * and PSE set.  Guest paging enabled without PAE:
                         * make sure PAE is clear in hardware as well.
                         */
                        if (!is_paging(vcpu)) {
                                hw_cr4 &= ~X86_CR4_PAE;
                                hw_cr4 |= X86_CR4_PSE;
                        } else if (!(cr4 & X86_CR4_PAE)) {
                                hw_cr4 &= ~X86_CR4_PAE;
                        }
                }

                /*
                 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
                 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
                 * to be manually disabled when guest switches to non-paging
                 * mode.
                 *
                 * If !enable_unrestricted_guest, the CPU is always running
                 * with CR0.PG=1 and CR4 needs to be modified.
                 * If enable_unrestricted_guest, the CPU automatically
                 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
                 */
                if (!is_paging(vcpu))
                        hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
        }

        /* The guest reads back 'cr4'; hardware runs with 'hw_cr4'. */
        vmcs_writel(CR4_READ_SHADOW, cr4);
        vmcs_writel(GUEST_CR4, hw_cr4);

        if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
                kvm_update_cpuid_runtime(vcpu);
}
3521
3522 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3523 {
3524         struct vcpu_vmx *vmx = to_vmx(vcpu);
3525         u32 ar;
3526
3527         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3528                 *var = vmx->rmode.segs[seg];
3529                 if (seg == VCPU_SREG_TR
3530                     || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3531                         return;
3532                 var->base = vmx_read_guest_seg_base(vmx, seg);
3533                 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3534                 return;
3535         }
3536         var->base = vmx_read_guest_seg_base(vmx, seg);
3537         var->limit = vmx_read_guest_seg_limit(vmx, seg);
3538         var->selector = vmx_read_guest_seg_selector(vmx, seg);
3539         ar = vmx_read_guest_seg_ar(vmx, seg);
3540         var->unusable = (ar >> 16) & 1;
3541         var->type = ar & 15;
3542         var->s = (ar >> 4) & 1;
3543         var->dpl = (ar >> 5) & 3;
3544         /*
3545          * Some userspaces do not preserve unusable property. Since usable
3546          * segment has to be present according to VMX spec we can use present
3547          * property to amend userspace bug by making unusable segment always
3548          * nonpresent. vmx_segment_access_rights() already marks nonpresent
3549          * segment as unusable.
3550          */
3551         var->present = !var->unusable;
3552         var->avl = (ar >> 12) & 1;
3553         var->l = (ar >> 13) & 1;
3554         var->db = (ar >> 14) & 1;
3555         var->g = (ar >> 15) & 1;
3556 }
3557
3558 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3559 {
3560         struct kvm_segment s;
3561
3562         if (to_vmx(vcpu)->rmode.vm86_active) {
3563                 vmx_get_segment(vcpu, &s, seg);
3564                 return s.base;
3565         }
3566         return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3567 }
3568
3569 static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache)
3570 {
3571         struct vcpu_vmx *vmx = to_vmx(vcpu);
3572         int ar;
3573
3574         if (unlikely(vmx->rmode.vm86_active))
3575                 return 0;
3576
3577         if (no_cache)
3578                 ar = vmcs_read32(GUEST_SS_AR_BYTES);
3579         else
3580                 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3581         return VMX_AR_DPL(ar);
3582 }
3583
/* Return the guest's CPL via __vmx_get_cpl() with no_cache == false. */
int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
        return __vmx_get_cpl(vcpu, false);
}
3588
/*
 * Return the guest's CPL via __vmx_get_cpl() with no_cache == true, i.e. SS's
 * access rights are read directly with vmcs_read32().
 */
int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu)
{
        return __vmx_get_cpl(vcpu, true);
}
3593
3594 static u32 vmx_segment_access_rights(struct kvm_segment *var)
3595 {
3596         u32 ar;
3597
3598         ar = var->type & 15;
3599         ar |= (var->s & 1) << 4;
3600         ar |= (var->dpl & 3) << 5;
3601         ar |= (var->present & 1) << 7;
3602         ar |= (var->avl & 1) << 12;
3603         ar |= (var->l & 1) << 13;
3604         ar |= (var->db & 1) << 14;
3605         ar |= (var->g & 1) << 15;
3606         ar |= (var->unusable || !var->present) << 16;
3607
3608         return ar;
3609 }
3610
3611 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3612 {
3613         struct vcpu_vmx *vmx = to_vmx(vcpu);
3614         const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3615
3616         vmx_segment_cache_clear(vmx);
3617
3618         if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3619                 vmx->rmode.segs[seg] = *var;
3620                 if (seg == VCPU_SREG_TR)
3621                         vmcs_write16(sf->selector, var->selector);
3622                 else if (var->s)
3623                         fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3624                 return;
3625         }
3626
3627         vmcs_writel(sf->base, var->base);
3628         vmcs_write32(sf->limit, var->limit);
3629         vmcs_write16(sf->selector, var->selector);
3630
3631         /*
3632          *   Fix the "Accessed" bit in AR field of segment registers for older
3633          * qemu binaries.
3634          *   IA32 arch specifies that at the time of processor reset the
3635          * "Accessed" bit in the AR field of segment registers is 1. And qemu
3636          * is setting it to 0 in the userland code. This causes invalid guest
3637          * state vmexit when "unrestricted guest" mode is turned on.
3638          *    Fix for this setup issue in cpu_reset is being pushed in the qemu
3639          * tree. Newer qemu binaries with that qemu fix would not need this
3640          * kvm hack.
3641          */
3642         if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3643                 var->type |= 0x1; /* Accessed */
3644
3645         vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3646 }
3647
3648 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3649 {
3650         __vmx_set_segment(vcpu, var, seg);
3651
3652         to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
3653 }
3654
3655 void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3656 {
3657         u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3658
3659         *db = (ar >> 14) & 1;
3660         *l = (ar >> 13) & 1;
3661 }
3662
3663 void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3664 {
3665         dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3666         dt->address = vmcs_readl(GUEST_IDTR_BASE);
3667 }
3668
3669 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3670 {
3671         vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3672         vmcs_writel(GUEST_IDTR_BASE, dt->address);
3673 }
3674
3675 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3676 {
3677         dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3678         dt->address = vmcs_readl(GUEST_GDTR_BASE);
3679 }
3680
3681 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3682 {
3683         vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3684         vmcs_writel(GUEST_GDTR_BASE, dt->address);
3685 }
3686
3687 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3688 {
3689         struct kvm_segment var;
3690         u32 ar;
3691
3692         vmx_get_segment(vcpu, &var, seg);
3693         var.dpl = 0x3;
3694         if (seg == VCPU_SREG_CS)
3695                 var.type = 0x3;
3696         ar = vmx_segment_access_rights(&var);
3697
3698         if (var.base != (var.selector << 4))
3699                 return false;
3700         if (var.limit != 0xffff)
3701                 return false;
3702         if (ar != 0xf3)
3703                 return false;
3704
3705         return true;
3706 }
3707
3708 static bool code_segment_valid(struct kvm_vcpu *vcpu)
3709 {
3710         struct kvm_segment cs;
3711         unsigned int cs_rpl;
3712
3713         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3714         cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3715
3716         if (cs.unusable)
3717                 return false;
3718         if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
3719                 return false;
3720         if (!cs.s)
3721                 return false;
3722         if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3723                 if (cs.dpl > cs_rpl)
3724                         return false;
3725         } else {
3726                 if (cs.dpl != cs_rpl)
3727                         return false;
3728         }
3729         if (!cs.present)
3730                 return false;
3731
3732         /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3733         return true;
3734 }
3735
3736 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3737 {
3738         struct kvm_segment ss;
3739         unsigned int ss_rpl;
3740
3741         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3742         ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3743
3744         if (ss.unusable)
3745                 return true;
3746         if (ss.type != 3 && ss.type != 7)
3747                 return false;
3748         if (!ss.s)
3749                 return false;
3750         if (ss.dpl != ss_rpl) /* DPL != RPL */
3751                 return false;
3752         if (!ss.present)
3753                 return false;
3754
3755         return true;
3756 }
3757
3758 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3759 {
3760         struct kvm_segment var;
3761         unsigned int rpl;
3762
3763         vmx_get_segment(vcpu, &var, seg);
3764         rpl = var.selector & SEGMENT_RPL_MASK;
3765
3766         if (var.unusable)
3767                 return true;
3768         if (!var.s)
3769                 return false;
3770         if (!var.present)
3771                 return false;
3772         if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
3773                 if (var.dpl < rpl) /* DPL < RPL */
3774                         return false;
3775         }
3776
3777         /* TODO: Add other members to kvm_segment_field to allow checking for other access
3778          * rights flags
3779          */
3780         return true;
3781 }
3782
3783 static bool tr_valid(struct kvm_vcpu *vcpu)
3784 {
3785         struct kvm_segment tr;
3786
3787         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3788
3789         if (tr.unusable)
3790                 return false;
3791         if (tr.selector & SEGMENT_TI_MASK)      /* TI = 1 */
3792                 return false;
3793         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3794                 return false;
3795         if (!tr.present)
3796                 return false;
3797
3798         return true;
3799 }
3800
3801 static bool ldtr_valid(struct kvm_vcpu *vcpu)
3802 {
3803         struct kvm_segment ldtr;
3804
3805         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3806
3807         if (ldtr.unusable)
3808                 return true;
3809         if (ldtr.selector & SEGMENT_TI_MASK)    /* TI = 1 */
3810                 return false;
3811         if (ldtr.type != 2)
3812                 return false;
3813         if (!ldtr.present)
3814                 return false;
3815
3816         return true;
3817 }
3818
3819 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3820 {
3821         struct kvm_segment cs, ss;
3822
3823         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3824         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3825
3826         return ((cs.selector & SEGMENT_RPL_MASK) ==
3827                  (ss.selector & SEGMENT_RPL_MASK));
3828 }
3829
3830 /*
3831  * Check if guest state is valid. Returns true if valid, false if
3832  * not.
3833  * We assume that registers are always usable
3834  */
3835 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3836 {
3837         /* real mode guest state checks */
3838         if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3839                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3840                         return false;
3841                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3842                         return false;
3843                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3844                         return false;
3845                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3846                         return false;
3847                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3848                         return false;
3849                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3850                         return false;
3851         } else {
3852         /* protected mode guest state checks */
3853                 if (!cs_ss_rpl_check(vcpu))
3854                         return false;
3855                 if (!code_segment_valid(vcpu))
3856                         return false;
3857                 if (!stack_segment_valid(vcpu))
3858                         return false;
3859                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3860                         return false;
3861                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3862                         return false;
3863                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3864                         return false;
3865                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3866                         return false;
3867                 if (!tr_valid(vcpu))
3868                         return false;
3869                 if (!ldtr_valid(vcpu))
3870                         return false;
3871         }
3872         /* TODO:
3873          * - Add checks on RIP
3874          * - Add checks on RFLAGS
3875          */
3876
3877         return true;
3878 }
3879
3880 static int init_rmode_tss(struct kvm *kvm, void __user *ua)
3881 {
3882         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3883         u16 data;
3884         int i;
3885
3886         for (i = 0; i < 3; i++) {
3887                 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
3888                         return -EFAULT;
3889         }
3890
3891         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3892         if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
3893                 return -EFAULT;
3894
3895         data = ~0;
3896         if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
3897                 return -EFAULT;
3898
3899         return 0;
3900 }
3901
/*
 * Lazily create and fill the one-page identity map that backs EPT for a guest
 * running in real mode.
 *
 * Under kvm->slots_lock: return immediately if the table was already built;
 * otherwise default the table's guest address to
 * VMX_EPT_IDENTITY_PAGETABLE_ADDR when none was configured, install the
 * private memslot, and write PAGE_SIZE / 4 entries, where entry i covers the
 * 4MiB-aligned address i << 22 and has the PSE bit set.
 *
 * The entry index is unsigned so that "i << 22" is well defined for indices
 * of 512 and above; left-shifting a signed int into the sign bit is undefined
 * behavior in ISO C.  The value stored in each entry is unchanged.
 *
 * Returns 0 on success, the error from __x86_set_memory_region(), or -EFAULT
 * if writing an entry to the user mapping fails.
 */
3902 static int init_rmode_identity_map(struct kvm *kvm)
3903 {
3904         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3905         void __user *uaddr;
3906         u32 i, tmp;
3907         int r = 0;
3908
3909         /* Protect kvm_vmx->ept_identity_pagetable_done. */
3910         mutex_lock(&kvm->slots_lock);
3911
3912         if (likely(kvm_vmx->ept_identity_pagetable_done))
3913                 goto out;
3914
3915         if (!kvm_vmx->ept_identity_map_addr)
3916                 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3917
3918         uaddr = __x86_set_memory_region(kvm,
3919                                         IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3920                                         kvm_vmx->ept_identity_map_addr,
3921                                         PAGE_SIZE);
3922         if (IS_ERR(uaddr)) {
3923                 r = PTR_ERR(uaddr);
3924                 goto out;
3925         }
3926
3927         /* Set up identity-mapping pagetable for EPT in real mode */
3928         for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
3929                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3930                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3931                 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
3932                         r = -EFAULT;
3933                         goto out;
3934                 }
3935         }
3936         kvm_vmx->ept_identity_pagetable_done = true;
3937
3938 out:
3939         mutex_unlock(&kvm->slots_lock);
3940         return r;
3941 }
3942
/*
 * Program the VMCS selector, base, limit and access-rights fields of segment
 * @seg with fixed defaults: selector 0, base 0 and limit 0xffff.
 */
3943 static void seg_setup(int seg)
3944 {
3945         const struct kvm_vmx_segment_field *field = kvm_vmx_segment_fields + seg;
3946         unsigned int access;
3947
3948         /* 0x93 is used for every segment; CS additionally gets the 0x08 bit. */
3949         access = (seg == VCPU_SREG_CS) ? (0x93 | 0x08) : 0x93;
3950
3951         /* Write order is unchanged: selector, base, limit, access rights. */
3952         vmcs_write16(field->selector, 0);
3953         vmcs_writel(field->base, 0);
3954         vmcs_write32(field->limit, 0xffff);
3955         vmcs_write32(field->ar_bytes, access);
3956 }
3957
/*
 * Reserve a VPID from vmx_vpid_bitmap under vmx_vpid_lock.
 *
 * Returns the index of the first clear bit after marking it used, or 0 when
 * VPIDs are disabled (enable_vpid is clear) or no bit below VMX_NR_VPIDS is
 * free.
 */
3958 int allocate_vpid(void)
3959 {
3960         int id = 0;
3961
3962         if (enable_vpid) {
3963                 spin_lock(&vmx_vpid_lock);
3964                 id = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
3965                 if (id >= VMX_NR_VPIDS)
3966                         id = 0; /* nothing free, report 0 */
3967                 else
3968                         __set_bit(id, vmx_vpid_bitmap);
3969                 spin_unlock(&vmx_vpid_lock);
3970         }
3971         return id;
3972 }
3973
/*
 * Release @vpid back to vmx_vpid_bitmap under vmx_vpid_lock.  Nothing is done
 * when VPIDs are disabled or @vpid is 0.
 */
3974 void free_vpid(int vpid)
3975 {
3976         if (enable_vpid && vpid != 0) {
3977                 spin_lock(&vmx_vpid_lock);
3978                 __clear_bit(vpid, vmx_vpid_bitmap);
3979                 spin_unlock(&vmx_vpid_lock);
3980         }
3981 }
3982
/*
 * Bookkeeping to run whenever vmcs01's MSR bitmap is about to be modified:
 * invalidate the enlightened-VMCS "clean" state for the bitmap if that
 * feature is in use, and force the nested MSR bitmap to be recalculated.
 */
3983 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
3984 {
3985         struct hv_enlightened_vmcs *hv_vmcs;
3986
3987         /*
3988          * With an enlightened VMCS and the 'Enlightened MSR Bitmap' feature
3989          * enabled (KVM running nested on top of Hyper-V), clear the MSR
3990          * bitmap clean flag so that L0 learns the bitmap was modified.
3991          */
3992         if (kvm_is_using_evmcs()) {
3993                 hv_vmcs = (void *)vmx->vmcs01.vmcs;
3994                 if (hv_vmcs->hv_enlightenments_control.msr_bitmap)
3995                         hv_vmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
3996         }
3997
3998         vmx->nested.force_msr_bitmap_recalc = true;
3999 }
4000
/*
 * Request pass-through of @msr for the access types in @type (MSR_TYPE_R
 * and/or MSR_TYPE_W) in vmcs01's MSR bitmap.  Does nothing when MSR bitmaps
 * are not supported.  An access type rejected by kvm_msr_allowed() ends up
 * intercepted instead of passed through.
 */
4001 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
4002 {
4003         struct vcpu_vmx *vmx = to_vmx(vcpu);
4004         unsigned long *bitmap = vmx->vmcs01.msr_bitmap;
4005         int slot;
4006
4007         if (!cpu_has_vmx_msr_bitmap())
4008                 return;
4009
4010         /* Flag vmcs01's MSR bitmap as modified before touching it. */
4011         vmx_msr_bitmap_l01_changed(vmx);
4012
4013         /*
4014          * Record in the shadow bitmaps that pass-through is wanted, so the
4015          * request can be replayed when the MSR filters change.
4016          */
4017         slot = vmx_get_passthrough_msr_slot(msr);
4018         if (slot >= 0 && (type & MSR_TYPE_R))
4019                 clear_bit(slot, vmx->shadow_msr_intercept.read);
4020         if (slot >= 0 && (type & MSR_TYPE_W))
4021                 clear_bit(slot, vmx->shadow_msr_intercept.write);
4022
4023         /*
4024          * The MSR filter has the last word: for each requested access type,
4025          * the intercept bit is cleared only if the filter allows the access
4026          * and is set otherwise.
4027          */
4028         if (type & MSR_TYPE_R) {
4029                 if (kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
4030                         vmx_clear_msr_bitmap_read(bitmap, msr);
4031                 else
4032                         vmx_set_msr_bitmap_read(bitmap, msr);
4033         }
4034
4035         if (type & MSR_TYPE_W) {
4036                 if (kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
4037                         vmx_clear_msr_bitmap_write(bitmap, msr);
4038                 else
4039                         vmx_set_msr_bitmap_write(bitmap, msr);
4040         }
4041 }
4042
/*
 * Intercept @msr for the access types in @type (MSR_TYPE_R and/or MSR_TYPE_W)
 * in vmcs01's MSR bitmap.  Does nothing when MSR bitmaps are not supported.
 */
4043 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
4044 {
4045         struct vcpu_vmx *vmx = to_vmx(vcpu);
4046         unsigned long *bitmap = vmx->vmcs01.msr_bitmap;
4047         int slot;
4048
4049         if (!cpu_has_vmx_msr_bitmap())
4050                 return;
4051
4052         /* Flag vmcs01's MSR bitmap as modified before touching it. */
4053         vmx_msr_bitmap_l01_changed(vmx);
4054
4055         /*
4056          * For MSRs that have a pass-through slot, also note in the shadow
4057          * bitmaps that interception is wanted; the shadow state is what gets
4058          * resynced when the MSR filter changes.
4059          */
4060         slot = vmx_get_passthrough_msr_slot(msr);
4061         if (type & MSR_TYPE_R) {
4062                 if (slot >= 0)
4063                         set_bit(slot, vmx->shadow_msr_intercept.read);
4064                 vmx_set_msr_bitmap_read(bitmap, msr);
4065         }
4066         if (type & MSR_TYPE_W) {
4067                 if (slot >= 0)
4068                         set_bit(slot, vmx->shadow_msr_intercept.write);
4069                 vmx_set_msr_bitmap_write(bitmap, msr);
4070         }
4071 }
4072
/*
 * Recompute the x2APIC portion of vmcs01's MSR bitmap for @vcpu.
 *
 * The wanted mode is derived from the secondary execution controls and the
 * APICv state and is cached in vmx->x2apic_msr_bitmap_mode; the bitmap is
 * left untouched when the mode has not changed.  Does nothing when MSR
 * bitmaps are unsupported, and warns once and bails if the local APIC is not
 * in-kernel.
 */
4073 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
4074 {
4075         /*
4076          * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
4077          * of the MSR bitmap.  KVM emulates APIC registers up through 0x3f0,
4078          * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
4079          */
4080         const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
4081         const int write_idx = read_idx + (0x800 / sizeof(u64));
4082         struct vcpu_vmx *vmx = to_vmx(vcpu);
4083         u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
4084         u8 mode;
4085
4086         if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
4087                 return;
4088
             /*
              * X2APIC mode requires the "virtualize x2APIC mode" secondary
              * control to be set; the APICV flag is added on top when APICv is
              * enabled and active for this vCPU.
              */
4089         if (cpu_has_secondary_exec_ctrls() &&
4090             (secondary_exec_controls_get(vmx) &
4091              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4092                 mode = MSR_BITMAP_MODE_X2APIC;
4093                 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4094                         mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4095         } else {
4096                 mode = 0;
4097         }
4098
             /* The bitmap already reflects this mode, nothing to redo. */
4099         if (mode == vmx->x2apic_msr_bitmap_mode)
4100                 return;
4101
4102         vmx->x2apic_msr_bitmap_mode = mode;
4103
4104         /*
4105          * Reset the bitmap for MSRs 0x800 - 0x83f.  Leave AMD's uber-extended
4106          * registers (0x840 and above) intercepted, KVM doesn't support them.
4107          * Intercept all writes by default and poke holes as needed.  Pass
4108          * through reads for all valid registers by default in x2APIC+APICv
4109          * mode, only the current timer count needs on-demand emulation by KVM.
4110          */
4111         if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
4112                 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
4113         else
4114                 msr_bitmap[read_idx] = ~0ull;
4115         msr_bitmap[write_idx] = ~0ull;
4116
4117         /*
4118          * TPR reads and writes can be virtualized even if virtual interrupt
4119          * delivery is not in use.
4120          */
4121         vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
4122                                   !(mode & MSR_BITMAP_MODE_X2APIC));
4123
             /*
              * With APICv: keep the current timer count intercepted, and pass
              * through writes to EOI and SELF_IPI, plus ICR accesses when IPI
              * virtualization is enabled.
              */
4124         if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4125                 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
4126                 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4127                 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
4128                 if (enable_ipiv)
4129                         vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
4130         }
4131 }
4132
/*
 * Set read/write interception of the Intel PT (RTIT) MSRs for @vcpu.  The
 * MSRs are intercepted exactly when RTIT_CTL_TRACEEN is clear in the guest's
 * RTIT control value; this covers STATUS, OUTPUT_BASE, OUTPUT_MASK,
 * CR3_MATCH and the ADDRn_A/ADDRn_B pair of every address range.
 */
4133 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
4134 {
4135         const struct vcpu_vmx *vmx = to_vmx(vcpu);
4136         const bool intercept = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4137         u32 range;
4138
4139         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, intercept);
4140         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, intercept);
4141         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, intercept);
4142         vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, intercept);
4143         for (range = 0; range < vmx->pt_desc.num_address_ranges; range++) {
4144                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + 2 * range, MSR_TYPE_RW, intercept);
4145                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + 2 * range, MSR_TYPE_RW, intercept);
4146         }
4147 }
4148
/*
 * Re-evaluate vmcs01's MSR intercepts for @vcpu after the MSR filter has
 * changed.  Does nothing when MSR bitmaps are not supported.
 */
4149 void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
4150 {
4151         struct vcpu_vmx *vmx = to_vmx(vcpu);
4152         u32 slot;
4153
4154         if (!cpu_has_vmx_msr_bitmap())
4155                 return;
4156
4157         /*
4158          * Replay the pass-through request for every MSR that the shadow
4159          * bitmaps say KVM wants passed through.  Disabling interception
4160          * checks the new filter and keeps the MSR intercepted if userspace
4161          * wants to filter it.  MSRs KVM already intercepts are skipped, as
4162          * they stay intercepted regardless of what userspace wants.
4163          */
4164         for (slot = 0; slot < ARRAY_SIZE(vmx_possible_passthrough_msrs); slot++) {
4165                 const u32 msr = vmx_possible_passthrough_msrs[slot];
4166
4167                 if (!test_bit(slot, vmx->shadow_msr_intercept.read))
4168                         vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
4169                 if (!test_bit(slot, vmx->shadow_msr_intercept.write))
4170                         vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
4171         }
4172
4173         /*
4174          * PT MSRs can only be passed through when PT is exposed to the guest.
4175          */
4176         if (vmx_pt_mode_is_host_guest())
4177                 pt_update_intercept_for_msr(vcpu);
4178 }
4179
/*
 * Notify @vcpu that a posted interrupt is pending, using notification vector
 * @pi_vec.  On SMP builds, a vCPU whose mode is IN_GUEST_MODE is sent an IPI
 * on its current CPU unless it is the vCPU running on this CPU; in every
 * other case the vCPU is woken via kvm_vcpu_wake_up().
 */
4180 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
4181                                                      int pi_vec)
4182 {
4183 #ifdef CONFIG_SMP
4184         if (vcpu->mode == IN_GUEST_MODE) {
4185                 /*
4186                  * The vector of the virtual interrupt has already been set in
4187                  * the PIR.  Send a notification event to deliver the virtual
4188                  * interrupt unless the vCPU is the currently running vCPU, i.e.
4189                  * the event is being sent from a fastpath VM-Exit handler, in
4190                  * which case the PIR will be synced to the vIRR before
4191                  * re-entering the guest.
4192                  *
4193                  * When the target is not the running vCPU, the following
4194                  * possibilities emerge:
4195                  *
4196                  * Case 1: vCPU stays in non-root mode. Sending a notification
4197                  * event posts the interrupt to the vCPU.
4198                  *
4199                  * Case 2: vCPU exits to root mode and is still runnable. The
4200                  * PIR will be synced to the vIRR before re-entering the guest.
4201                  * Sending a notification event is ok as the host IRQ handler
4202                  * will ignore the spurious event.
4203                  *
4204                  * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
4205                  * has already synced PIR to vIRR and never blocks the vCPU if
4206                  * the vIRR is not empty. Therefore, a blocked vCPU here does
4207                  * not wait for any requested interrupts in PIR, and sending a
4208                  * notification event also results in a benign, spurious event.
4209                  */
4210
4211                 if (vcpu != kvm_get_running_vcpu())
4212                         __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
4213                 return;
4214         }
4215 #endif
4216         /*
4217          * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
4218          * otherwise do nothing as KVM will grab the highest priority pending
4219          * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
4220          */
4221         kvm_vcpu_wake_up(vcpu);
4222 }
4223
/*
 * Try to deliver @vector as a nested posted interrupt.
 *
 * Returns 0 if the vCPU is in guest mode and @vector matches the cached
 * nested notification vector, in which case the interrupt is flagged as
 * pending, an event request is made and a notification is triggered.
 * Returns -1 otherwise, i.e. the caller has to deliver the interrupt itself.
 */
4224 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4225                                                 int vector)
4226 {
4227         struct vcpu_vmx *vmx = to_vmx(vcpu);
4228
4229         /*
4230          * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
4231          * and freed, and must not be accessed outside of vcpu->mutex.  The
4232          * vCPU's cached PI NV is valid if and only if posted interrupts are
4233          * enabled in its vmcs12, i.e. checking the vector also checks that
4234          * L1 has enabled posted interrupts for L2.
4235          */
4236         if (is_guest_mode(vcpu) &&
4237             vector == vmx->nested.posted_intr_nv) {
4238                 /*
4239                  * If a posted intr is not recognized by hardware,
4240                  * we will accomplish it in the next vmentry.
4241                  */
4242                 vmx->nested.pi_pending = true;
4243                 kvm_make_request(KVM_REQ_EVENT, vcpu);
4244
4245                 /*
4246                  * This pairs with the smp_mb_*() after setting vcpu->mode in
4247                  * vcpu_enter_guest() to guarantee the vCPU sees the event
4248                  * request if triggering a posted interrupt "fails" because
4249                  * vcpu->mode != IN_GUEST_MODE.  The extra barrier is needed as
4250                  * the smp_wmb() in kvm_make_request() only ensures everything
4251                  * done before making the request is visible when the request
4252                  * is visible, it doesn't ensure ordering between the store to
4253                  * vcpu->requests and the load from vcpu->mode.
4254                  */
4255                 smp_mb__after_atomic();
4256
4257                 /* the PIR and ON have been set by L1. */
4258                 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
4259                 return 0;
4260         }
4261         return -1;
4262 }
4263 /*
4264  * Send interrupt to vcpu via posted interrupt way.
4265  * 1. If target vcpu is running(non-root mode), send posted interrupt
4266  * notification to vcpu and hardware will sync PIR to vIRR atomically.
4267  * 2. If target vcpu isn't running(root mode), kick it to pick up the
4268  * interrupt from PIR in next vmentry.
      *
      * Returns 0 if the interrupt was posted (either as a nested posted
      * interrupt or through this vCPU's posted-interrupt descriptor), or -1 if
      * APICv is not active for the vCPU and the caller must use another path.
4269  */
4270 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4271 {
4272         struct vcpu_vmx *vmx = to_vmx(vcpu);
4273         int r;
4274
             /* Give the nested path first refusal; 0 means it took the vector. */
4275         r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4276         if (!r)
4277                 return 0;
4278
4279         /* Note, this is called iff the local APIC is in-kernel. */
4280         if (!vcpu->arch.apic->apicv_active)
4281                 return -1;
4282
             /* PIR bit for @vector was already set, so there is nothing new to post. */
4283         if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4284                 return 0;
4285
4286         /* If a previous notification has sent the IPI, nothing to do.  */
4287         if (pi_test_and_set_on(&vmx->pi_desc))
4288                 return 0;
4289
4290         /*
4291          * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
4292          * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
4293          * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
4294          * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
4295          */
4296         kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
4297         return 0;
4298 }
4299
/*
 * Deliver @vector to the vCPU that owns @apic.  Posted-interrupt delivery is
 * attempted first; @delivery_mode and @trig_mode are only used for the
 * tracepoint emitted when that succeeds.  If posting is not possible, the
 * vector is recorded with kvm_lapic_set_irr(), an event request is made and
 * the vCPU is kicked.
 */
4300 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
4301                            int trig_mode, int vector)
4302 {
4303         struct kvm_vcpu *target = apic->vcpu;
4304
4305         if (!vmx_deliver_posted_interrupt(target, vector)) {
4306                 trace_kvm_apicv_accept_irq(target->vcpu_id, delivery_mode, trig_mode, vector);
4307                 return;
4308         }
4309         /* Posting was not possible: fall back to IRR + event request + kick. */
4310         kvm_lapic_set_irr(vector, apic);
4311         kvm_make_request(KVM_REQ_EVENT, target);
4312         kvm_vcpu_kick(target);
4313 }
4314
4315 /*
4316  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4317  * will not change in the lifetime of the guest.
4318  * Note that host-state that does change is set elsewhere. E.g., host-state
4319  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
      *
      * The CR3 and CR4 values written here are also cached in
      * @vmx->loaded_vmcs->host_state.
4320  */
4321 void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4322 {
4323         u32 low32, high32;
4324         unsigned long tmpl;
4325         unsigned long cr0, cr3, cr4;
4326
4327         cr0 = read_cr0();
4328         WARN_ON(cr0 & X86_CR0_TS);
4329         vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
4330
4331         /*
4332          * Save the most likely value for this task's CR3 in the VMCS.
4333          * We can't use __get_current_cr3_fast() because we're not atomic.
4334          */
4335         cr3 = __read_cr3();
4336         vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
4337         vmx->loaded_vmcs->host_state.cr3 = cr3;
4338
4339         /* Save the most likely value for this task's CR4 in the VMCS. */
4340         cr4 = cr4_read_shadow();
4341         vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
4342         vmx->loaded_vmcs->host_state.cr4 = cr4;
4343
4344         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
4345 #ifdef CONFIG_X86_64
4346         /*
4347          * Load null selectors, so we can avoid reloading them in
4348          * vmx_prepare_switch_to_host(), in case userspace uses
4349          * the null selectors too (the expected case).
4350          */
4351         vmcs_write16(HOST_DS_SELECTOR, 0);
4352         vmcs_write16(HOST_ES_SELECTOR, 0);
4353 #else
4354         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4355         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4356 #endif
4357         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4358         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
4359
4360         vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */
4361
             /* Point the host RIP field at vmx_vmexit. */
4362         vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
4363
             /* Only the low half of SYSENTER_CS is written to the 32-bit field. */
4364         rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4365         vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4366
4367         /*
4368          * SYSENTER is used for 32-bit system calls on either 32-bit or
4369          * 64-bit kernels.  It is always zero if neither is allowed, otherwise
4370          * vmx_vcpu_load_vmcs() loads it with the per-CPU entry stack (and may
4371          * have already done so!).
4372          */
4373         if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
4374                 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
4375
4376         rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4377         vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
4378
             /* Host PAT is only written when the "load IA32_PAT" exit control is in use. */
4379         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4380                 rdmsr(MSR_IA32_CR_PAT, low32, high32);
4381                 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4382         }
4383
4384         if (cpu_has_load_ia32_efer())
4385                 vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
4386 }
4387
/*
 * Recompute which CR4 bits the guest owns and write the complement to the
 * CR4_GUEST_HOST_MASK VMCS field.  The starting set is
 * KVM_POSSIBLE_CR4_GUEST_BITS minus the guest's reserved CR4 bits.
 */
4388 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4389 {
4390         struct kvm_vcpu *vcpu = &vmx->vcpu;
4391
4392         vcpu->arch.cr4_guest_owned_bits = ~vcpu->arch.cr4_guest_rsvd_bits &
4393                                           KVM_POSSIBLE_CR4_GUEST_BITS;
4394         /* Without EPT, the TLB-flush and PDPTR related bits are not guest-owned. */
4395         if (!enable_ept)
4396                 vcpu->arch.cr4_guest_owned_bits &= ~(X86_CR4_TLBFLUSH_BITS |
4397                                                      X86_CR4_PDPTR_BITS);
4398         /* In guest mode, bits set in vmcs12's cr4_guest_host_mask are removed too. */
4399         if (is_guest_mode(vcpu))
4400                 vcpu->arch.cr4_guest_owned_bits &= ~get_vmcs12(vcpu)->cr4_guest_host_mask;
4401         vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
4402 }
4403
/*
 * Compute the pin-based VM-execution controls for @vmx: the configured base
 * value with posted interrupts removed when APICv is not active for the vCPU,
 * virtual NMIs removed when enable_vnmi is off, and the VMX preemption timer
 * removed when enable_preemption_timer is off.
 */
4404 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4405 {
4406         u32 base = vmcs_config.pin_based_exec_ctrl;
4407         u32 strip = 0;
4408
4409         /* Collect every control that has to be masked off the base value. */
4410         if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4411                 strip |= PIN_BASED_POSTED_INTR;
4412         if (!enable_vnmi)
4413                 strip |= PIN_BASED_VIRTUAL_NMIS;
4414         if (!enable_preemption_timer)
4415                 strip |= PIN_BASED_VMX_PREEMPTION_TIMER;
4416
4417         return base & ~strip;
4418 }
4419
/*
 * Compute the base VM-entry controls: the configured value minus the controls
 * that are toggled dynamically and, in PT system mode, minus the PT-related
 * entry controls.
 */
4420 static u32 vmx_vmentry_ctrl(void)
4421 {
4422         /*
4423          * IA32e mode and the loading of EFER and PERF_GLOBAL_CTRL are
4424          * toggled dynamically, so they are never part of the value
4425          * returned here.
4426          */
4427         u32 masked = VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER |
4428                      VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4429
4430         /* In PT system mode the PT-related entry controls are dropped too. */
4431         if (vmx_pt_mode_is_system())
4432                 masked |= VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL;
4433
4434         return vmcs_config.vmentry_ctrl & ~masked;
4435 }
4436
/*
 * Compute the base VM-exit controls: the configured value minus the SAVE_*
 * controls KVM never sets itself, minus the dynamically toggled LOAD_*
 * controls and, in PT system mode, minus the PT-related exit controls.
 */
4437 static u32 vmx_vmexit_ctrl(void)
4438 {
4439         /*
4440          * The SAVE_* controls are not used by KVM and are never set in vmcs01
4441          * or vmcs02; they are only emulated for nested virtualization and are
4442          * thus allowed in vmcs12.  Loading of EFER and PERF_GLOBAL_CTRL is
4443          * toggled dynamically.
4444          */
4445         u32 masked = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
4446                      VM_EXIT_SAVE_VMX_PREEMPTION_TIMER |
4447                      VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER;
4448
4449         /* In PT system mode the PT-related exit controls are dropped too. */
4450         if (vmx_pt_mode_is_system())
4451                 masked |= VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL;
4452
4453         return vmcs_config.vmexit_ctrl & ~masked;
4454 }
4455
/*
 * Bring the APICv-dependent execution controls of @vcpu in line with its
 * current APICv state: the pin-based controls, the APIC-register
 * virtualization and virtual-interrupt delivery secondary controls, the IPI
 * virtualization tertiary control (when enable_ipiv is set) and the x2APIC
 * MSR bitmap.
 */
4456 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4457 {
4458         const u32 apicv_ctls = SECONDARY_EXEC_APIC_REGISTER_VIRT |
4459                                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
4460         struct vcpu_vmx *vmx = to_vmx(vcpu);
4461
4462         /* In guest mode, only note that vmcs01's APICv status needs updating. */
4463         if (is_guest_mode(vcpu)) {
4464                 vmx->nested.update_vmcs01_apicv_status = true;
4465                 return;
4466         }
4467
4468         pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4469
4470         if (!kvm_vcpu_apicv_active(vcpu)) {
4471                 secondary_exec_controls_clearbit(vmx, apicv_ctls);
4472                 if (enable_ipiv)
4473                         tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4474         } else {
4475                 secondary_exec_controls_setbit(vmx, apicv_ctls);
4476                 if (enable_ipiv)
4477                         tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4478         }
4479
4480         /* The x2APIC MSR intercepts depend on the APICv state; resync them. */
4481         vmx_update_msr_bitmap_x2apic(vcpu);
4482 }
4483
4484 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4485 {
4486         u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4487
4488         /*
4489          * Not used by KVM, but fully supported for nesting, i.e. are allowed in
4490          * vmcs12 and propagated to vmcs02 when set in vmcs12.
4491          */
4492         exec_control &= ~(CPU_BASED_RDTSC_EXITING |
4493                           CPU_BASED_USE_IO_BITMAPS |
4494                           CPU_BASED_MONITOR_TRAP_FLAG |
4495                           CPU_BASED_PAUSE_EXITING);
4496
4497         /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
4498         exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
4499                           CPU_BASED_NMI_WINDOW_EXITING);
4500
4501         if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4502                 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4503
4504         if (!cpu_need_tpr_shadow(&vmx->vcpu))
4505                 exec_control &= ~CPU_BASED_TPR_SHADOW;
4506
4507 #ifdef CONFIG_X86_64
4508         if (exec_control & CPU_BASED_TPR_SHADOW)
4509                 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
4510                                   CPU_BASED_CR8_STORE_EXITING);
4511         else
4512                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
4513                                 CPU_BASED_CR8_LOAD_EXITING;
4514 #endif
4515         /* No need to intercept CR3 access or INVPLG when using EPT. */
4516         if (enable_ept)
4517                 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
4518                                   CPU_BASED_CR3_STORE_EXITING |
4519                                   CPU_BASED_INVLPG_EXITING);
4520         if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4521                 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4522                                 CPU_BASED_MONITOR_EXITING);
4523         if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4524                 exec_control &= ~CPU_BASED_HLT_EXITING;
4525         return exec_control;
4526 }
4527
/*
 * Compute the tertiary processor-based VM-execution controls for @vmx: the
 * configured base value, with IPI virtualization removed unless it can be
 * used for this vCPU.
 */
4528 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4529 {
4530         const u64 base = vmcs_config.cpu_based_3rd_exec_ctrl;
4531
4532         /*
4533          * Keep IPI virtualization only when it is enabled and APICv, which
4534          * it relies on, is active (not inhibited) for this vCPU.
4535          */
4536         if (enable_ipiv && kvm_vcpu_apicv_active(&vmx->vcpu))
4537                 return base;
4538
4539         return base & ~TERTIARY_EXEC_IPI_VIRT;
4540 }
4541
4542 /*
4543  * Update a single secondary execution control bit so that the associated
4544  * instruction is intercepted or allowed in the guest, typically depending on
4545  * whether the feature is exposed to the guest (needed to emulate faults).
4546  */
4547 static inline void
4548 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
4549                                   u32 control, bool enabled, bool exiting)
4550 {
4551         /*
4552          * The caller presets every supported bit in *exec_control.  An opt-in
4553          * control is cleared when the feature is not exposed to the guest; an
4554          * opt-out control, i.e. an exiting control, is cleared when the
4555          * feature _is_ exposed, as interception is then disabled for the
4556          * instruction.  Both cases reduce to "enabled == exiting".
4557          */
4558         if (enabled == exiting)
4559                 *exec_control &= ~control;
4560
4561         /*
4562          * The nested VMX MSRs are only adjusted when nesting is enabled and
4563          * the KVM_X86_QUIRK_STUFF_FEATURE_MSRS quirk is in effect for the VM.
4564          */
4565         if (!nested || !kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
4566                 return;
4567
4568         /*
4569          * A control that can be added to or removed from the VMX MSRs must
4570          * be supported for nested virtualization in the first place; warn
4571          * once and treat the feature as disabled otherwise.
4572          */
4573         if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
4574                 enabled = false;
4575
4576         if (enabled)
4577                 vmx->nested.msrs.secondary_ctls_high |= control;
4578         else
4579                 vmx->nested.msrs.secondary_ctls_high &= ~control;
4580 }
4581
4582 /*
4583  * Wrapper macro for the common case of adjusting a secondary execution control
4584  * based on a single guest CPUID bit, with a dedicated feature bit.  This also
4585  * verifies that the control is actually supported by KVM and hardware.
      *
      * @name selects the cpu_has_vmx_<name>() capability check, @feat_name the
      * X86_FEATURE_<feat_name> guest feature and @ctrl_name the
      * SECONDARY_EXEC_<ctrl_name> control.  Governed features are queried with
      * guest_can_use(), all others with guest_cpuid_has().  Nothing is adjusted
      * when the capability check fails.  @exec_control and @exiting are passed
      * through to vmx_adjust_secondary_exec_control().
4586  */
4587 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting)     \
4588 ({                                                                                              \
4589         struct kvm_vcpu *__vcpu = &(vmx)->vcpu;                                                 \
4590         bool __enabled;                                                                         \
4591                                                                                                 \
4592         if (cpu_has_vmx_##name()) {                                                             \
4593                 if (kvm_is_governed_feature(X86_FEATURE_##feat_name))                           \
4594                         __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name);             \
4595                 else                                                                            \
4596                         __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name);           \
4597                 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
4598                                                   __enabled, exiting);                          \
4599         }                                                                                       \
4600 })
4601
4602 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
     /*
      * vmx_adjust_sec_exec_feature() targets SECONDARY_EXEC_ENABLE_<uname> and
      * passes exiting=false; vmx_adjust_sec_exec_exiting() targets
      * SECONDARY_EXEC_<uname>_EXITING and passes exiting=true.  In both, @lname
      * names the cpu_has_vmx_<lname>() check and @uname the X86_FEATURE_<uname>
      * bit.
      */
4603 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4604         vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4605
4606 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4607         vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
4608
4609 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4610 {
4611         struct kvm_vcpu *vcpu = &vmx->vcpu;
4612
4613         u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4614
4615         if (vmx_pt_mode_is_system())
4616                 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
4617         if (!cpu_need_virtualize_apic_accesses(vcpu))
4618                 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4619         if (vmx->vpid == 0)
4620                 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4621         if (!enable_ept) {
4622                 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4623                 exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
4624                 enable_unrestricted_guest = 0;
4625         }
4626         if (!enable_unrestricted_guest)
4627                 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4628         if (kvm_pause_in_guest(vmx->vcpu.kvm))
4629                 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4630         if (!kvm_vcpu_apicv_active(vcpu))
4631                 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4632                                   SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4633         exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4634
4635         /*
4636          * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
4637          * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
4638          */
4639         exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
4640
4641         /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4642          * in vmx_set_cr4.  */
4643         exec_control &= ~SECONDARY_EXEC_DESC;
4644
4645         /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4646            (handle_vmptrld).
4647            We can NOT enable shadow_vmcs here because we don't have yet
4648            a current VMCS12
4649         */
4650         exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4651
4652         /*
         * PML is enabled/disabled when dirty logging of memslots changes, but
4654          * it needs to be set here when dirty logging is already active, e.g.
4655          * if this vCPU was created after dirty logging was enabled.
4656          */
4657         if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
4658                 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4659
4660         vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
4661
4662         /*
4663          * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
4664          * feature is exposed to the guest.  This creates a virtualization hole
4665          * if both are supported in hardware but only one is exposed to the
4666          * guest, but letting the guest execute RDTSCP or RDPID when either one
4667          * is advertised is preferable to emulating the advertised instruction
4668          * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4669          */
4670         if (cpu_has_vmx_rdtscp()) {
4671                 bool rdpid_or_rdtscp_enabled =
4672                         guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
4673                         guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
4674
4675                 vmx_adjust_secondary_exec_control(vmx, &exec_control,
4676                                                   SECONDARY_EXEC_ENABLE_RDTSCP,
4677                                                   rdpid_or_rdtscp_enabled, false);
4678         }
4679
4680         vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
4681
4682         vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4683         vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
4684
4685         vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4686                                     ENABLE_USR_WAIT_PAUSE, false);
4687
4688         if (!vcpu->kvm->arch.bus_lock_detection_enabled)
4689                 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
4690
4691         if (!kvm_notify_vmexit_enabled(vcpu->kvm))
4692                 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
4693
4694         return exec_control;
4695 }
4696
4697 static inline int vmx_get_pid_table_order(struct kvm *kvm)
4698 {
4699         return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
4700 }
4701
4702 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4703 {
4704         struct page *pages;
4705         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4706
4707         if (!irqchip_in_kernel(kvm) || !enable_ipiv)
4708                 return 0;
4709
4710         if (kvm_vmx->pid_table)
4711                 return 0;
4712
4713         pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
4714                             vmx_get_pid_table_order(kvm));
4715         if (!pages)
4716                 return -ENOMEM;
4717
4718         kvm_vmx->pid_table = (void *)page_address(pages);
4719         return 0;
4720 }
4721
/*
 * Delegates to vmx_alloc_ipiv_pid_table() and returns its result: 0, or
 * -ENOMEM if the PID table pages could not be allocated.
 */
int vmx_vcpu_precreate(struct kvm *kvm)
{
        return vmx_alloc_ipiv_pid_table(kvm);
}
4726
/* Value init_vmcs() writes to XSS_EXIT_BITMAP when cpu_has_vmx_xsaves(). */
#define VMX_XSS_EXIT_BITMAP 0
4728
/*
 * Write the initial VMCS configuration for @vmx: the pin/primary/secondary/
 * tertiary execution controls, VM-Entry/VM-Exit controls, host-state fields,
 * MSR load/store areas, CR0/CR4 guest/host masks and initial values for a
 * handful of guest-state fields.  Many writes are conditional on CPU
 * capabilities or module/VM configuration.  Called from __vmx_vcpu_reset().
 */
static void init_vmcs(struct vcpu_vmx *vmx)
{
        struct kvm *kvm = vmx->vcpu.kvm;
        struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

        if (nested)
                nested_vmx_set_vmcs_shadowing_bitmap();

        if (cpu_has_vmx_msr_bitmap())
                vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));

        vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */

        /* Control */
        pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

        exec_controls_set(vmx, vmx_exec_control(vmx));

        if (cpu_has_secondary_exec_ctrls()) {
                secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
                /* Only point the VMCS at a #VE info area if one exists. */
                if (vmx->ve_info)
                        vmcs_write64(VE_INFORMATION_ADDRESS,
                                     __pa(vmx->ve_info));
        }

        if (cpu_has_tertiary_exec_ctrls())
                tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));

        /* APICv fields are only programmed with an in-kernel local APIC. */
        if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
                vmcs_write64(EOI_EXIT_BITMAP1, 0);
                vmcs_write64(EOI_EXIT_BITMAP2, 0);
                vmcs_write64(EOI_EXIT_BITMAP3, 0);

                vmcs_write16(GUEST_INTR_STATUS, 0);

                vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
                vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
        }

        /*
         * Point the VMCS at the per-VM PID table; the last valid index is
         * the highest possible vCPU ID.
         */
        if (vmx_can_use_ipiv(&vmx->vcpu)) {
                vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
                vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
        }

        /*
         * PLE_GAP is written here; the PLE window is only cached in @vmx and
         * flagged dirty, it is not written to the VMCS by this function.
         */
        if (!kvm_pause_in_guest(kvm)) {
                vmcs_write32(PLE_GAP, ple_gap);
                vmx->ple_window = ple_window;
                vmx->ple_window_dirty = true;
        }

        if (kvm_notify_vmexit_enabled(kvm))
                vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);

        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
        vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
        vmx_set_constant_host_state(vmx);
        vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
        vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */

        if (cpu_has_vmx_vmfunc())
                vmcs_write64(VM_FUNCTION_CONTROL, 0);

        /*
         * Start with empty MSR load/store lists; only the addresses of the
         * host and guest autoload arrays are programmed.
         */
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);

        vm_exit_controls_set(vmx, vmx_vmexit_ctrl());

        /* 22.2.1, 20.8.1 */
        vm_entry_controls_set(vmx, vmx_vmentry_ctrl());

        /* The CR0 guest/host mask is the complement of the guest-owned bits. */
        vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);

        set_cr4_guest_host_mask(vmx);

        if (vmx->vpid != 0)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

        if (cpu_has_vmx_xsaves())
                vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);

        if (enable_pml) {
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }

        vmx_write_encls_bitmap(&vmx->vcpu, NULL);

        if (vmx_pt_mode_is_host_guest()) {
                memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
                /* Bit[6~0] are forced to 1, writes are ignored. */
                vmx->pt_desc.guest.output_mask = 0x7F;
                vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
        }

        vmcs_write32(GUEST_SYSENTER_CS, 0);
        vmcs_writel(GUEST_SYSENTER_ESP, 0);
        vmcs_writel(GUEST_SYSENTER_EIP, 0);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

        if (cpu_has_vmx_tpr_shadow()) {
                /* Default to 0; overridden just below when a TPR shadow is needed. */
                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
                if (cpu_need_tpr_shadow(&vmx->vcpu))
                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
                                     __pa(vmx->vcpu.arch.apic->regs));
                vmcs_write32(TPR_THRESHOLD, 0);
        }

        vmx_setup_uret_msrs(vmx);
}
4850
4851 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4852 {
4853         struct vcpu_vmx *vmx = to_vmx(vcpu);
4854
4855         init_vmcs(vmx);
4856
4857         if (nested &&
4858             kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
4859                 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
4860
4861         vcpu_setup_sgx_lepubkeyhash(vcpu);
4862
4863         vmx->nested.posted_intr_nv = -1;
4864         vmx->nested.vmxon_ptr = INVALID_GPA;
4865         vmx->nested.current_vmptr = INVALID_GPA;
4866
4867 #ifdef CONFIG_KVM_HYPERV
4868         vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
4869 #endif
4870
4871         if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
4872                 vcpu->arch.microcode_version = 0x100000000ULL;
4873         vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
4874
4875         /*
4876          * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
4877          * or POSTED_INTR_WAKEUP_VECTOR.
4878          */
4879         vmx->pi_desc.nv = POSTED_INTR_VECTOR;
4880         __pi_set_sn(&vmx->pi_desc);
4881 }
4882
/*
 * Reset the VMX-specific state of @vcpu.
 *
 * @init_event: when false, __vmx_vcpu_reset() is run first (which programs
 *              the VMCS via init_vmcs()); when true that step is skipped and
 *              only the state below is reinitialized.
 *
 * The remainder resets cached vcpu_vmx fields, CR8, the guest segment, TR,
 * LDTR, GDTR and IDTR fields, and the guest activity/interruptibility state,
 * then requests an APIC page reload and syncs the vCPU's VPID context.
 */
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!init_event)
                __vmx_vcpu_reset(vcpu);

        vmx->rmode.vm86_active = 0;
        vmx->spec_ctrl = 0;

        vmx->msr_ia32_umwait_control = 0;

        vmx->hv_deadline_tsc = -1;
        kvm_set_cr8(vcpu, 0);

        /* CS: seg_setup() defaults, then selector 0xf000 and base 0xffff0000. */
        seg_setup(VCPU_SREG_CS);
        vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
        vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);

        seg_setup(VCPU_SREG_DS);
        seg_setup(VCPU_SREG_ES);
        seg_setup(VCPU_SREG_FS);
        seg_setup(VCPU_SREG_GS);
        seg_setup(VCPU_SREG_SS);

        vmcs_write16(GUEST_TR_SELECTOR, 0);
        vmcs_writel(GUEST_TR_BASE, 0);
        vmcs_write32(GUEST_TR_LIMIT, 0xffff);
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

        vmcs_write16(GUEST_LDTR_SELECTOR, 0);
        vmcs_writel(GUEST_LDTR_BASE, 0);
        vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
        vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

        vmcs_writel(GUEST_GDTR_BASE, 0);
        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

        vmcs_writel(GUEST_IDTR_BASE, 0);
        vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

        /* The segment fields were rewritten above, so drop the cached copies. */
        vmx_segment_cache_clear(vmx);
        kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);

        vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
        if (kvm_mpx_supported())
                vmcs_write64(GUEST_BNDCFGS, 0);

        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

        vpid_sync_context(vmx->vpid);

        vmx_update_fb_clear_dis(vcpu, vmx);
}
4941
/* Set CPU_BASED_INTR_WINDOW_EXITING in @vcpu's primary execution controls. */
void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
        exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
4946
4947 void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
4948 {
4949         if (!enable_vnmi ||
4950             vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4951                 vmx_enable_irq_window(vcpu);
4952                 return;
4953         }
4954
4955         exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
4956 }
4957
4958 void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
4959 {
4960         struct vcpu_vmx *vmx = to_vmx(vcpu);
4961         uint32_t intr;
4962         int irq = vcpu->arch.interrupt.nr;
4963
4964         trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
4965
4966         ++vcpu->stat.irq_injections;
4967         if (vmx->rmode.vm86_active) {
4968                 int inc_eip = 0;
4969                 if (vcpu->arch.interrupt.soft)
4970                         inc_eip = vcpu->arch.event_exit_inst_len;
4971                 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
4972                 return;
4973         }
4974         intr = irq | INTR_INFO_VALID_MASK;
4975         if (vcpu->arch.interrupt.soft) {
4976                 intr |= INTR_TYPE_SOFT_INTR;
4977                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4978                              vmx->vcpu.arch.event_exit_inst_len);
4979         } else
4980                 intr |= INTR_TYPE_EXT_INTR;
4981         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4982
4983         vmx_clear_hlt(vcpu);
4984 }
4985
4986 void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4987 {
4988         struct vcpu_vmx *vmx = to_vmx(vcpu);
4989
4990         if (!enable_vnmi) {
4991                 /*
4992                  * Tracking the NMI-blocked state in software is built upon
4993                  * finding the next open IRQ window. This, in turn, depends on
4994                  * well-behaving guests: They have to keep IRQs disabled at
4995                  * least as long as the NMI handler runs. Otherwise we may
4996                  * cause NMI nesting, maybe breaking the guest. But as this is
4997                  * highly unlikely, we can live with the residual risk.
4998                  */
4999                 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
5000                 vmx->loaded_vmcs->vnmi_blocked_time = 0;
5001         }
5002
5003         ++vcpu->stat.nmi_injections;
5004         vmx->loaded_vmcs->nmi_known_unmasked = false;
5005
5006         if (vmx->rmode.vm86_active) {
5007                 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
5008                 return;
5009         }
5010
5011         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5012                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
5013
5014         vmx_clear_hlt(vcpu);
5015 }
5016
5017 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
5018 {
5019         struct vcpu_vmx *vmx = to_vmx(vcpu);
5020         bool masked;
5021
5022         if (!enable_vnmi)
5023                 return vmx->loaded_vmcs->soft_vnmi_blocked;
5024         if (vmx->loaded_vmcs->nmi_known_unmasked)
5025                 return false;
5026         masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
5027         vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5028         return masked;
5029 }
5030
5031 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
5032 {
5033         struct vcpu_vmx *vmx = to_vmx(vcpu);
5034
5035         if (!enable_vnmi) {
5036                 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
5037                         vmx->loaded_vmcs->soft_vnmi_blocked = masked;
5038                         vmx->loaded_vmcs->vnmi_blocked_time = 0;
5039                 }
5040         } else {
5041                 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5042                 if (masked)
5043                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5044                                       GUEST_INTR_STATE_NMI);
5045                 else
5046                         vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5047                                         GUEST_INTR_STATE_NMI);
5048         }
5049 }
5050
5051 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5052 {
5053         if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5054                 return false;
5055
5056         if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5057                 return true;
5058
5059         return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5060                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
5061                  GUEST_INTR_STATE_NMI));
5062 }
5063
5064 int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5065 {
5066         if (to_vmx(vcpu)->nested.nested_run_pending)
5067                 return -EBUSY;
5068
5069         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
5070         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5071                 return -EBUSY;
5072
5073         return !vmx_nmi_blocked(vcpu);
5074 }
5075
5076 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5077 {
5078         return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
5079                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5080                 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5081 }
5082
5083 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5084 {
5085         if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5086                 return false;
5087
5088         return __vmx_interrupt_blocked(vcpu);
5089 }
5090
5091 int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5092 {
5093         if (to_vmx(vcpu)->nested.nested_run_pending)
5094                 return -EBUSY;
5095
5096         /*
5097          * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
5098          * e.g. if the IRQ arrived asynchronously after checking nested events.
5099          */
5100         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5101                 return -EBUSY;
5102
5103         return !vmx_interrupt_blocked(vcpu);
5104 }
5105
5106 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5107 {
5108         void __user *ret;
5109
5110         if (enable_unrestricted_guest)
5111                 return 0;
5112
5113         mutex_lock(&kvm->slots_lock);
5114         ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5115                                       PAGE_SIZE * 3);
5116         mutex_unlock(&kvm->slots_lock);
5117
5118         if (IS_ERR(ret))
5119                 return PTR_ERR(ret);
5120
5121         to_kvm_vmx(kvm)->tss_addr = addr;
5122
5123         return init_rmode_tss(kvm, ret);
5124 }
5125
5126 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5127 {
5128         to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
5129         return 0;
5130 }
5131
5132 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
5133 {
5134         switch (vec) {
5135         case BP_VECTOR:
5136                 /*
5137                  * Update instruction length as we may reinject the exception
5138                  * from user space while in guest debugging mode.
5139                  */
5140                 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5141                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5142                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5143                         return false;
5144                 fallthrough;
5145         case DB_VECTOR:
5146                 return !(vcpu->guest_debug &
5147                         (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
5148         case DE_VECTOR:
5149         case OF_VECTOR:
5150         case BR_VECTOR:
5151         case UD_VECTOR:
5152         case DF_VECTOR:
5153         case SS_VECTOR:
5154         case GP_VECTOR:
5155         case MF_VECTOR:
5156                 return true;
5157         }
5158         return false;
5159 }
5160
5161 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5162                                   int vec, u32 err_code)
5163 {
5164         /*
5165          * Instruction with address size override prefix opcode 0x67
5166          * Cause the #SS fault with 0 error code in VM86 mode.
5167          */
5168         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5169                 if (kvm_emulate_instruction(vcpu, 0)) {
5170                         if (vcpu->arch.halt_request) {
5171                                 vcpu->arch.halt_request = 0;
5172                                 return kvm_emulate_halt_noskip(vcpu);
5173                         }
5174                         return 1;
5175                 }
5176                 return 0;
5177         }
5178
5179         /*
5180          * Forward all other exceptions that are valid in real mode.
5181          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5182          *        the required debugging infrastructure rework.
5183          */
5184         kvm_queue_exception(vcpu, vec);
5185         return 1;
5186 }
5187
/*
 * Exit handler for machine checks.  Does no work of its own and always
 * returns 1; the actual handling happens elsewhere, as noted below.
 */
static int handle_machine_check(struct kvm_vcpu *vcpu)
{
        /* handled by vmx_vcpu_run() */
        return 1;
}
5193
5194 /*
5195  * If the host has split lock detection disabled, then #AC is
5196  * unconditionally injected into the guest, which is the pre split lock
5197  * detection behaviour.
5198  *
5199  * If the host has split lock detection enabled then #AC is
5200  * only injected into the guest when:
5201  *  - Guest CPL == 3 (user mode)
5202  *  - Guest has #AC detection enabled in CR0
5203  *  - Guest EFLAGS has AC bit set
5204  */
5205 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
5206 {
5207         if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5208                 return true;
5209
5210         return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
5211                (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5212 }
5213
/*
 * Exit handler for exception/NMI VM-Exits.
 *
 * Dispatches on the exit interruption info: some vectors are handled
 * elsewhere and simply return, some are re-queued for the guest, some are
 * emulated, and the rest are reported through @vcpu->run.
 *
 * Returns 1 on the paths that queue/emulate/ignore the event, and 0 after
 * filling in kvm_run (KVM_EXIT_INTERNAL_ERROR, KVM_EXIT_DEBUG or
 * KVM_EXIT_EXCEPTION).  NOTE(review): presumably 1 means "resume the guest"
 * and 0 means "exit to userspace" per the usual exit-handler convention --
 * confirm against the caller.
 */
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct kvm_run *kvm_run = vcpu->run;
        u32 intr_info, ex_no, error_code;
        unsigned long cr2, dr6;
        u32 vect_info;

        vect_info = vmx->idt_vectoring_info;
        intr_info = vmx_get_intr_info(vcpu);

        /*
         * Machine checks are handled by handle_exception_irqoff(), or by
         * vmx_vcpu_run() if a #MC occurs on VM-Entry.  NMIs are handled by
         * vmx_vcpu_enter_exit().
         */
        if (is_machine_check(intr_info) || is_nmi(intr_info))
                return 1;

        /*
         * Queue the exception here instead of in handle_nm_fault_irqoff().
         * This ensures the nested_vmx check is not skipped so vmexit can
         * be reflected to L1 (when it intercepts #NM) before reaching this
         * point.
         */
        if (is_nm_fault(intr_info)) {
                kvm_queue_exception(vcpu, NM_VECTOR);
                return 1;
        }

        if (is_invalid_opcode(intr_info))
                return handle_ud(vcpu);

        /* A #VE reaching this handler is unexpected: warn and dump state. */
        if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
                struct vmx_ve_information *ve_info = vmx->ve_info;

                WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
                          "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
                dump_vmcs(vcpu);
                kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
                return 1;
        }

        /* The error code is only read when the exit info says one exists. */
        error_code = 0;
        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);

        if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
                WARN_ON_ONCE(!enable_vmware_backdoor);

                /*
                 * VMware backdoor emulation on #GP interception only handles
                 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
                 * error code on #GP.
                 */
                if (error_code) {
                        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
                        return 1;
                }
                return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
        }

        /*
         * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
         * MMIO, it is better to report an internal error.
         * See the comments in vmx_handle_exit.
         */
        if ((vect_info & VECTORING_INFO_VALID_MASK) &&
            !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
                vcpu->run->internal.ndata = 4;
                vcpu->run->internal.data[0] = vect_info;
                vcpu->run->internal.data[1] = intr_info;
                vcpu->run->internal.data[2] = error_code;
                vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
                return 0;
        }

        if (is_page_fault(intr_info)) {
                /* For #PF the exit qualification is used as the CR2 value. */
                cr2 = vmx_get_exit_qual(vcpu);
                if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
                        /*
                         * EPT will cause page fault only if we need to
                         * detect illegal GPAs.
                         */
                        WARN_ON_ONCE(!allow_smaller_maxphyaddr);
                        kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
                        return 1;
                } else
                        return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
        }

        ex_no = intr_info & INTR_INFO_VECTOR_MASK;

        if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
                return handle_rmode_exception(vcpu, ex_no, error_code);

        switch (ex_no) {
        case DB_VECTOR:
                /* For #DB the exit qualification is used as the DR6 payload. */
                dr6 = vmx_get_exit_qual(vcpu);
                if (!(vcpu->guest_debug &
                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
                        /*
                         * If the #DB was due to ICEBP, a.k.a. INT1, skip the
                         * instruction.  ICEBP generates a trap-like #DB, but
                         * despite its interception control being tied to #DB,
                         * is an instruction intercept, i.e. the VM-Exit occurs
                         * on the ICEBP itself.  Use the inner "skip" helper to
                         * avoid single-step #DB and MTF updates, as ICEBP is
                         * higher priority.  Note, skipping ICEBP still clears
                         * STI and MOVSS blocking.
                         *
                         * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
                         * if single-step is enabled in RFLAGS and STI or MOVSS
                         * blocking is active, as the CPU doesn't set the bit
                         * on VM-Exit due to #DB interception.  VM-Entry has a
                         * consistency check that a single-step #DB is pending
                         * in this scenario as the previous instruction cannot
                         * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
                         * don't modify RFLAGS), therefore the one instruction
                         * delay when activating single-step breakpoints must
                         * have already expired.  Note, the CPU sets/clears BS
                         * as appropriate for all other VM-Exits types.
                         */
                        if (is_icebp(intr_info))
                                WARN_ON(!skip_emulated_instruction(vcpu));
                        else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
                                 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                                  (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
                                vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
                                            vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);

                        kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
                        return 1;
                }
                /* Guest debugging is active: report the #DB via kvm_run. */
                kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
                kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
                fallthrough;
        case BP_VECTOR:
                /*
                 * Update instruction length as we may reinject #BP from
                 * user space while in guest debugging mode. Reading it for
                 * #DB as well causes no harm, it is not used in that case.
                 */
                vmx->vcpu.arch.event_exit_inst_len =
                        vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
                kvm_run->exit_reason = KVM_EXIT_DEBUG;
                kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
                kvm_run->debug.arch.exception = ex_no;
                break;
        case AC_VECTOR:
                if (vmx_guest_inject_ac(vcpu)) {
                        kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
                        return 1;
                }

                /*
                 * Handle split lock. Depending on detection mode this will
                 * either warn and disable split lock detection for this
                 * task or force SIGBUS on it.
                 */
                if (handle_guest_split_lock(kvm_rip_read(vcpu)))
                        return 1;
                fallthrough;
        default:
                /* Anything not handled above is reported via kvm_run. */
                kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
                kvm_run->ex.exception = ex_no;
                kvm_run->ex.error_code = error_code;
                break;
        }
        return 0;
}
5387
/*
 * Exit handler for external interrupts: only bumps the irq_exits statistic
 * and returns 1; no other work is done here.
 */
static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
        ++vcpu->stat.irq_exits;
        return 1;
}
5393
5394 static int handle_triple_fault(struct kvm_vcpu *vcpu)
5395 {
5396         vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5397         vcpu->mmio_needed = 0;
5398         return 0;
5399 }
5400
5401 static int handle_io(struct kvm_vcpu *vcpu)
5402 {
5403         unsigned long exit_qualification;
5404         int size, in, string;
5405         unsigned port;
5406
5407         exit_qualification = vmx_get_exit_qual(vcpu);
5408         string = (exit_qualification & 16) != 0;
5409
5410         ++vcpu->stat.io_exits;
5411
5412         if (string)
5413                 return kvm_emulate_instruction(vcpu, 0);
5414
5415         port = exit_qualification >> 16;
5416         size = (exit_qualification & 7) + 1;
5417         in = (exit_qualification & 8) != 0;
5418
5419         return kvm_fast_pio(vcpu, size, port, in);
5420 }
5421
/*
 * Write the three-byte VMCALL instruction (0f 01 c1) to @hypercall.
 * @vcpu is unused; @hypercall must have room for at least three bytes.
 */
void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
        static const unsigned char vmcall_insn[] = { 0x0f, 0x01, 0xc1 };
        unsigned int i;

        for (i = 0; i < sizeof(vmcall_insn) / sizeof(vmcall_insn[0]); i++)
                hypercall[i] = vmcall_insn[i];
}
5431
5432 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5433 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5434 {
5435         if (is_guest_mode(vcpu)) {
5436                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5437                 unsigned long orig_val = val;
5438
5439                 /*
5440                  * We get here when L2 changed cr0 in a way that did not change
5441                  * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5442                  * but did change L0 shadowed bits. So we first calculate the
5443                  * effective cr0 value that L1 would like to write into the
5444                  * hardware. It consists of the L2-owned bits from the new
5445                  * value combined with the L1-owned bits from L1's guest_cr0.
5446                  */
5447                 val = (val & ~vmcs12->cr0_guest_host_mask) |
5448                         (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5449
5450                 if (kvm_set_cr0(vcpu, val))
5451                         return 1;
5452                 vmcs_writel(CR0_READ_SHADOW, orig_val);
5453                 return 0;
5454         } else {
5455                 return kvm_set_cr0(vcpu, val);
5456         }
5457 }
5458
5459 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5460 {
5461         if (is_guest_mode(vcpu)) {
5462                 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5463                 unsigned long orig_val = val;
5464
5465                 /* analogously to handle_set_cr0 */
5466                 val = (val & ~vmcs12->cr4_guest_host_mask) |
5467                         (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5468                 if (kvm_set_cr4(vcpu, val))
5469                         return 1;
5470                 vmcs_writel(CR4_READ_SHADOW, orig_val);
5471                 return 0;
5472         } else
5473                 return kvm_set_cr4(vcpu, val);
5474 }
5475
5476 static int handle_desc(struct kvm_vcpu *vcpu)
5477 {
5478         /*
5479          * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
5480          * and other code needs to be updated if UMIP can be guest owned.
5481          */
5482         BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
5483
5484         WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
5485         return kvm_emulate_instruction(vcpu, 0);
5486 }
5487
/*
 * Exit handler for control-register accesses (EXIT_REASON_CR_ACCESS).
 *
 * The exit qualification is decoded as: bits 3:0 = control register number,
 * bits 5:4 = access type (0 = MOV to CR, 1 = MOV from CR, 2 = CLTS,
 * 3 = LMSW), bits 11:8 = general-purpose register operand.
 *
 * Returns 1 when the guest can be resumed, 0 when userspace must be involved
 * (CR8 lowered without an in-kernel LAPIC, or an unhandled combination), and
 * -EIO for a CLTS exit, which KVM treats as a bug.
 */
static int handle_cr(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification, val;
        int cr;
        int reg;
        int err;
        int ret;

        exit_qualification = vmx_get_exit_qual(vcpu);
        cr = exit_qualification & 15;
        reg = (exit_qualification >> 8) & 15;
        switch ((exit_qualification >> 4) & 3) {
        case 0: /* mov to cr */
                val = kvm_register_read(vcpu, reg);
                trace_kvm_cr_write(cr, val);
                switch (cr) {
                case 0:
                        err = handle_set_cr0(vcpu, val);
                        return kvm_complete_insn_gp(vcpu, err);
                case 3:
                        /* CR3 exits are not expected with unrestricted guest. */
                        WARN_ON_ONCE(enable_unrestricted_guest);

                        err = kvm_set_cr3(vcpu, val);
                        return kvm_complete_insn_gp(vcpu, err);
                case 4:
                        err = handle_set_cr4(vcpu, val);
                        return kvm_complete_insn_gp(vcpu, err);
                case 8: {
                                u8 cr8_prev = kvm_get_cr8(vcpu);
                                u8 cr8 = (u8)val;
                                err = kvm_set_cr8(vcpu, cr8);
                                ret = kvm_complete_insn_gp(vcpu, err);
                                /* With an in-kernel LAPIC nothing more is needed. */
                                if (lapic_in_kernel(vcpu))
                                        return ret;
                                /* Userspace is only told when CR8 was lowered. */
                                if (cr8_prev <= cr8)
                                        return ret;
                                /*
                                 * TODO: we might be squashing a
                                 * KVM_GUESTDBG_SINGLESTEP-triggered
                                 * KVM_EXIT_DEBUG here.
                                 */
                                vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
                                return 0;
                        }
                }
                break;
        case 2: /* clts */
                KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
                return -EIO;
        case 1: /* mov from cr */
                switch (cr) {
                case 3:
                        WARN_ON_ONCE(enable_unrestricted_guest);

                        val = kvm_read_cr3(vcpu);
                        kvm_register_write(vcpu, reg, val);
                        trace_kvm_cr_read(cr, val);
                        return kvm_skip_emulated_instruction(vcpu);
                case 8:
                        val = kvm_get_cr8(vcpu);
                        kvm_register_write(vcpu, reg, val);
                        trace_kvm_cr_read(cr, val);
                        return kvm_skip_emulated_instruction(vcpu);
                }
                break;
        case 3: /* lmsw */
                /* LMSW only supplies the low four bits of CR0. */
                val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
                trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
                kvm_lmsw(vcpu, val);

                return kvm_skip_emulated_instruction(vcpu);
        default:
                break;
        }
        /* Reached for any access-type/CR combination not handled above. */
        vcpu->run->exit_reason = 0;
        vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
               (int)(exit_qualification >> 4) & 3, cr);
        return 0;
}
5567
/*
 * Exit handler for debug-register accesses (EXIT_REASON_DR_ACCESS).
 *
 * Returns 1 when the guest can be resumed (possibly with an exception
 * queued), 0 when a KVM_EXIT_DEBUG must be delivered to userspace, or the
 * result of kvm_complete_insn_gp() for an emulated MOV to/from DR.
 */
static int handle_dr(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification;
        int dr, dr7, reg;
        int err = 1;

        exit_qualification = vmx_get_exit_qual(vcpu);
        dr = exit_qualification & DEBUG_REG_ACCESS_NUM;

        /* First, if DR does not exist, trigger UD */
        if (!kvm_require_dr(vcpu, dr))
                return 1;

        /*
         * err is still 1 here, so a CPL > 0 access is completed as a failure
         * (presumably a #GP, going by kvm_complete_insn_gp()'s name).
         */
        if (vmx_get_cpl(vcpu) > 0)
                goto out;

        dr7 = vmcs_readl(GUEST_DR7);
        if (dr7 & DR7_GD) {
                /*
                 * As the vm-exit takes precedence over the debug trap, we
                 * need to emulate the latter, either for the host or the
                 * guest debugging itself.
                 */
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
                        /* Host is debugging the guest: report to userspace. */
                        vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
                        vcpu->run->debug.arch.dr7 = dr7;
                        vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
                        vcpu->run->debug.arch.exception = DB_VECTOR;
                        vcpu->run->exit_reason = KVM_EXIT_DEBUG;
                        return 0;
                } else {
                        /* Otherwise queue a #DB with DR6.BD for the guest. */
                        kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
                        return 1;
                }
        }

        if (vcpu->guest_debug == 0) {
                exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

                /*
                 * No more DR vmexits; force a reload of the debug registers
                 * and reenter on this instruction.  The next vmexit will
                 * retrieve the full state of the debug registers.
                 */
                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
                return 1;
        }

        /* Emulate the MOV itself; a read from a DR cannot fail. */
        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
        if (exit_qualification & TYPE_MOV_FROM_DR) {
                kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
                err = 0;
        } else {
                err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
        }

out:
        return kvm_complete_insn_gp(vcpu, err);
}
5627
/*
 * Pull the guest's debug register state back into vcpu->arch and resume
 * intercepting MOV DR.  DR0-DR3 and DR6 are fetched with get_debugreg(), DR7
 * is read from the VMCS.
 */
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
        get_debugreg(vcpu->arch.db[0], 0);
        get_debugreg(vcpu->arch.db[1], 1);
        get_debugreg(vcpu->arch.db[2], 2);
        get_debugreg(vcpu->arch.db[3], 3);
        get_debugreg(vcpu->arch.dr6, 6);
        vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);

        /* Undo what handle_dr() did when it stopped intercepting MOV DR. */
        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
        exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);

        /*
         * exc_debug expects dr6 to be cleared after it runs; make sure it
         * does not see a stale dr6 from the guest.
         */
        set_debugreg(DR6_RESERVED, 6);
}
5646
/* Write @val into the GUEST_DR7 field of the current VMCS. */
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
        vmcs_writel(GUEST_DR7, val);
}
5651
/*
 * Exit handler for EXIT_REASON_TPR_BELOW_THRESHOLD: calls
 * kvm_apic_update_ppr() for the vCPU and resumes the guest (returns 1).
 */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
        kvm_apic_update_ppr(vcpu);
        return 1;
}
5657
/*
 * Exit handler for interrupt-window VM-Exits: stops requesting further
 * interrupt-window exits, raises KVM_REQ_EVENT on the vCPU and counts the
 * exit.  Returns 1 (the guest may be resumed).
 */
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
        exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);

        kvm_make_request(KVM_REQ_EVENT, vcpu);

        ++vcpu->stat.irq_window_exits;
        return 1;
}
5667
/*
 * Exit handler for INVLPG: the exit qualification is passed to
 * kvm_mmu_invlpg() as the address to invalidate, then the instruction is
 * skipped.
 */
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
        kvm_mmu_invlpg(vcpu, vmx_get_exit_qual(vcpu));

        return kvm_skip_emulated_instruction(vcpu);
}
5675
5676 static int handle_apic_access(struct kvm_vcpu *vcpu)
5677 {
5678         if (likely(fasteoi)) {
5679                 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5680                 int access_type, offset;
5681
5682                 access_type = exit_qualification & APIC_ACCESS_TYPE;
5683                 offset = exit_qualification & APIC_ACCESS_OFFSET;
5684                 /*
5685                  * Sane guest uses MOV to write EOI, with written value
5686                  * not cared. So make a short-circuit here by avoiding
5687                  * heavy instruction emulation.
5688                  */
5689                 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5690                     (offset == APIC_EOI)) {
5691                         kvm_lapic_set_eoi(vcpu);
5692                         return kvm_skip_emulated_instruction(vcpu);
5693                 }
5694         }
5695         return kvm_emulate_instruction(vcpu, 0);
5696 }
5697
/*
 * Exit handler for EOI-induced VM-Exits.  The exit is trap-like, so the
 * instruction pointer needs no adjustment.
 */
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
        /* The low byte of the exit qualification carries the vector. */
        int vector = vmx_get_exit_qual(vcpu) & 0xff;

        kvm_apic_set_eoi_accelerated(vcpu, vector);

        return 1;
}
5707
5708 static int handle_apic_write(struct kvm_vcpu *vcpu)
5709 {
5710         unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5711
5712         /*
5713          * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
5714          * hardware has done any necessary aliasing, offset adjustments, etc...
5715          * for the access.  I.e. the correct value has already been  written to
5716          * the vAPIC page for the correct 16-byte chunk.  KVM needs only to
5717          * retrieve the register value and emulate the access.
5718          */
5719         u32 offset = exit_qualification & 0xff0;
5720
5721         kvm_apic_write_nodecode(vcpu, offset);
5722         return 1;
5723 }
5724
/*
 * Exit handler for task-switch VM-Exits: cleans up the event whose delivery
 * triggered the switch (if any) and emulates the switch via
 * kvm_task_switch(), whose result is returned.
 */
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qualification;
        bool has_error_code = false;
        u32 error_code = 0;
        u16 tss_selector;
        int reason, type, idt_v, idt_index;

        /* Decode the IDT-vectoring info: valid bit, vector and event type. */
        idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
        idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
        type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);

        exit_qualification = vmx_get_exit_qual(vcpu);

        /* Bits 31:30 of the exit qualification give the switch's source. */
        reason = (u32)exit_qualification >> 30;
        if (reason == TASK_SWITCH_GATE && idt_v) {
                /*
                 * The switch went through a task gate while an event was
                 * being delivered; drop KVM's record of that event.
                 */
                switch (type) {
                case INTR_TYPE_NMI_INTR:
                        vcpu->arch.nmi_injected = false;
                        vmx_set_nmi_mask(vcpu, true);
                        break;
                case INTR_TYPE_EXT_INTR:
                case INTR_TYPE_SOFT_INTR:
                        kvm_clear_interrupt_queue(vcpu);
                        break;
                case INTR_TYPE_HARD_EXCEPTION:
                        /* Keep the error code to hand to kvm_task_switch(). */
                        if (vmx->idt_vectoring_info &
                            VECTORING_INFO_DELIVER_CODE_MASK) {
                                has_error_code = true;
                                error_code =
                                        vmcs_read32(IDT_VECTORING_ERROR_CODE);
                        }
                        fallthrough;
                case INTR_TYPE_SOFT_EXCEPTION:
                        kvm_clear_exception_queue(vcpu);
                        break;
                default:
                        break;
                }
        }
        /* The u16 assignment keeps the low 16 bits: the new TSS selector. */
        tss_selector = exit_qualification;

        /*
         * Skip the instruction unless the switch happened while delivering a
         * hardware exception, an external interrupt or an NMI.
         */
        if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
                       type != INTR_TYPE_EXT_INTR &&
                       type != INTR_TYPE_NMI_INTR))
                WARN_ON(!skip_emulated_instruction(vcpu));

        /*
         * TODO: What about debug traps on tss switch?
         *       Are we supposed to inject them and update dr6?
         */
        return kvm_task_switch(vcpu, tss_selector,
                               type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
                               reason, has_error_code, error_code);
}
5781
5782 static int handle_ept_violation(struct kvm_vcpu *vcpu)
5783 {
5784         unsigned long exit_qualification;
5785         gpa_t gpa;
5786         u64 error_code;
5787
5788         exit_qualification = vmx_get_exit_qual(vcpu);
5789
5790         /*
5791          * EPT violation happened while executing iret from NMI,
5792          * "blocked by NMI" bit has to be set before next VM entry.
5793          * There are errata that may cause this bit to not be set:
5794          * AAK134, BY25.
5795          */
5796         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5797                         enable_vnmi &&
5798                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5799                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5800
5801         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5802         trace_kvm_page_fault(vcpu, gpa, exit_qualification);
5803
5804         /* Is it a read fault? */
5805         error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5806                      ? PFERR_USER_MASK : 0;
5807         /* Is it a write fault? */
5808         error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5809                       ? PFERR_WRITE_MASK : 0;
5810         /* Is it a fetch fault? */
5811         error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5812                       ? PFERR_FETCH_MASK : 0;
5813         /* ept page table entry is present? */
5814         error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
5815                       ? PFERR_PRESENT_MASK : 0;
5816
5817         if (error_code & EPT_VIOLATION_GVA_IS_VALID)
5818                 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
5819                               PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
5820
5821         /*
5822          * Check that the GPA doesn't exceed physical memory limits, as that is
5823          * a guest page fault.  We have to emulate the instruction here, because
5824          * if the illegal address is that of a paging structure, then
5825          * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
5826          * would also use advanced VM-exit information for EPT violations to
5827          * reconstruct the page fault error code.
5828          */
5829         if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
5830                 return kvm_emulate_instruction(vcpu, 0);
5831
5832         return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5833 }
5834
5835 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5836 {
5837         gpa_t gpa;
5838
5839         if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
5840                 return 1;
5841
5842         /*
5843          * A nested guest cannot optimize MMIO vmexits, because we have an
5844          * nGPA here instead of the required GPA.
5845          */
5846         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5847         if (!is_guest_mode(vcpu) &&
5848             !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5849                 trace_kvm_fast_mmio(gpa);
5850                 return kvm_skip_emulated_instruction(vcpu);
5851         }
5852
5853         return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5854 }
5855
/*
 * Exit handler for NMI-window VM-Exits.  Such an exit with virtual NMIs
 * disabled is flagged as a KVM bug and fails with -EIO.  Otherwise NMI-window
 * exiting is turned off again, the exit is counted, KVM_REQ_EVENT is raised
 * and the guest is resumed (returns 1).
 */
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
        if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
                return -EIO;

        exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
        ++vcpu->stat.nmi_window_exits;
        kvm_make_request(KVM_REQ_EVENT, vcpu);

        return 1;
}
5867
5868 static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
5869 {
5870         struct vcpu_vmx *vmx = to_vmx(vcpu);
5871
5872         return vmx->emulation_required && !vmx->rmode.vm86_active &&
5873                (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
5874 }
5875
/*
 * Emulate guest instructions for as long as vmx->emulation_required stays
 * set, up to 130 loop iterations per call.
 *
 * Returns 1 when the vCPU run loop should continue, 0 when an exit to
 * userspace has been prepared, or whatever handle_interrupt_window() /
 * kvm_emulate_halt_noskip() return when those paths are taken.
 */
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool intr_window_requested;
        unsigned count = 130;

        intr_window_requested = exec_controls_get(vmx) &
                                CPU_BASED_INTR_WINDOW_EXITING;

        while (vmx->emulation_required && count-- != 0) {
                /* An interrupt window was asked for and is now open. */
                if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
                        return handle_interrupt_window(&vmx->vcpu);

                /* Let the run loop process the pending event request. */
                if (kvm_test_request(KVM_REQ_EVENT, vcpu))
                        return 1;

                if (!kvm_emulate_instruction(vcpu, 0))
                        return 0;

                /* Report this state to userspace as an emulation failure. */
                if (vmx_emulation_required_with_pending_exception(vcpu)) {
                        kvm_prepare_emulation_failure_exit(vcpu);
                        return 0;
                }

                if (vcpu->arch.halt_request) {
                        vcpu->arch.halt_request = 0;
                        return kvm_emulate_halt_noskip(vcpu);
                }

                /*
                 * Note, return 1 and not 0, vcpu_run() will invoke
                 * xfer_to_guest_mode() which will create a proper return
                 * code.
                 */
                if (__xfer_to_guest_mode_work_pending())
                        return 1;
        }

        return 1;
}
5916
/*
 * Pre-run check for a vCPU.  Returns 1 when the vCPU may be run.  Returns 0,
 * after preparing an emulation-failure exit, when emulation is required while
 * an exception is pending or injected.
 */
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
        if (!vmx_emulation_required_with_pending_exception(vcpu))
                return 1;

        kvm_prepare_emulation_failure_exit(vcpu);
        return 0;
}
5926
/*
 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
 * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
 *
 * Grows the PLE window unless PAUSE is allowed in the guest, reports the
 * vCPU as spinning, and skips the PAUSE instruction.
 */
static int handle_pause(struct kvm_vcpu *vcpu)
{
        if (!kvm_pause_in_guest(vcpu->kvm))
                grow_ple_window(vcpu);

        /*
         * Intel SDM vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
         * VM-execution control is ignored if CPL > 0. OTOH, KVM
         * never sets PAUSE_EXITING and only sets PLE if supported,
         * so the vcpu must be at CPL=0 if it gets a PAUSE exit.
         */
        kvm_vcpu_on_spin(vcpu, true);
        return kvm_skip_emulated_instruction(vcpu);
}
5945
/*
 * Exit handler for EXIT_REASON_MONITOR_TRAP_FLAG: no work is done here, the
 * guest is simply resumed (returns 1).
 */
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
        return 1;
}
5950
5951 static int handle_invpcid(struct kvm_vcpu *vcpu)
5952 {
5953         u32 vmx_instruction_info;
5954         unsigned long type;
5955         gva_t gva;
5956         struct {
5957                 u64 pcid;
5958                 u64 gla;
5959         } operand;
5960         int gpr_index;
5961
5962         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
5963                 kvm_queue_exception(vcpu, UD_VECTOR);
5964                 return 1;
5965         }
5966
5967         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5968         gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
5969         type = kvm_register_read(vcpu, gpr_index);
5970
5971         /* According to the Intel instruction reference, the memory operand
5972          * is read even if it isn't needed (e.g., for type==all)
5973          */
5974         if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5975                                 vmx_instruction_info, false,
5976                                 sizeof(operand), &gva))
5977                 return 1;
5978
5979         return kvm_handle_invpcid(vcpu, type, gva);
5980 }
5981
5982 static int handle_pml_full(struct kvm_vcpu *vcpu)
5983 {
5984         unsigned long exit_qualification;
5985
5986         trace_kvm_pml_full(vcpu->vcpu_id);
5987
5988         exit_qualification = vmx_get_exit_qual(vcpu);
5989
5990         /*
5991          * PML buffer FULL happened while executing iret from NMI,
5992          * "blocked by NMI" bit has to be set before next VM entry.
5993          */
5994         if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5995                         enable_vnmi &&
5996                         (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5997                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5998                                 GUEST_INTR_STATE_NMI);
5999
6000         /*
6001          * PML buffer already flushed at beginning of VMEXIT. Nothing to do
6002          * here.., and there's no userspace involvement needed for PML.
6003          */
6004         return 1;
6005 }
6006
/*
 * Fastpath handler for a preemption-timer exit.
 *
 * @force_immediate_exit: true when the caller requested an immediate exit,
 *                        i.e. the timer may have been armed just for that.
 *
 * Returns an EXIT_FASTPATH_* value telling the caller whether to re-enter the
 * guest, treat the exit as handled, or fall back to the regular handler.
 */
static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
                                                   bool force_immediate_exit)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * In the *extremely* unlikely scenario that this is a spurious VM-Exit
         * due to the timer expiring while it was "soft" disabled, just eat the
         * exit and re-enter the guest.
         */
        if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
                return EXIT_FASTPATH_REENTER_GUEST;

        /*
         * If the timer expired because KVM used it to force an immediate exit,
         * then mission accomplished.
         */
        if (force_immediate_exit)
                return EXIT_FASTPATH_EXIT_HANDLED;

        /*
         * If L2 is active, go down the slow path as emulating the guest timer
         * expiration likely requires synthesizing a nested VM-Exit.
         */
        if (is_guest_mode(vcpu))
                return EXIT_FASTPATH_NONE;

        /* Otherwise the timer was emulating the guest's own timer. */
        kvm_lapic_expired_hv_timer(vcpu);
        return EXIT_FASTPATH_REENTER_GUEST;
}
6037
/*
 * Slow-path exit handler for the preemption timer: signals the expired timer
 * to the local APIC code and resumes the guest (returns 1).
 */
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
        /*
         * This non-fastpath handler is reached if and only if the preemption
         * timer was being used to emulate a guest timer while L2 is active.
         * All other scenarios are supposed to be handled in the fastpath.
         */
        WARN_ON_ONCE(!is_guest_mode(vcpu));
        kvm_lapic_expired_hv_timer(vcpu);
        return 1;
}
6049
/*
 * When nested=0, all VMX instruction VM Exits filter here.  The handlers
 * are overwritten by nested_vmx_setup() when nested=1.
 *
 * Queues a #UD for the guest and resumes it (returns 1).
 */
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
}
6059
#ifndef CONFIG_X86_SGX_KVM
/*
 * ENCLS exit handler used when the kernel is built without
 * CONFIG_X86_SGX_KVM: queues a #UD for the guest and resumes it (returns 1).
 */
static int handle_encls(struct kvm_vcpu *vcpu)
{
        /*
         * SGX virtualization is disabled.  There is no software enable bit for
         * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
         * the guest from executing ENCLS (when SGX is supported by hardware).
         */
        kvm_queue_exception(vcpu, UD_VECTOR);
        return 1;
}
#endif /* CONFIG_X86_SGX_KVM */
6072
/*
 * Exit handler for EXIT_REASON_BUS_LOCK: records that a bus lock was detected
 * in the cached exit reason and resumes the guest (returns 1).
 */
static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{
        /*
         * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
         * VM-Exits. Unconditionally set the flag here and leave the handling to
         * vmx_handle_exit().
         */
        to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
        return 1;
}
6083
6084 static int handle_notify(struct kvm_vcpu *vcpu)
6085 {
6086         unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6087         bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6088
6089         ++vcpu->stat.notify_window_exits;
6090
6091         /*
6092          * Notify VM exit happened while executing iret from NMI,
6093          * "blocked by NMI" bit has to be set before next VM entry.
6094          */
6095         if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6096                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6097                               GUEST_INTR_STATE_NMI);
6098
6099         if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
6100             context_invalid) {
6101                 vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6102                 vcpu->run->notify.flags = context_invalid ?
6103                                           KVM_NOTIFY_CONTEXT_INVALID : 0;
6104                 return 0;
6105         }
6106
6107         return 1;
6108 }
6109
/*
 * Table of VM-Exit handlers, indexed by EXIT_REASON_* value.
 *
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
 * to be done to userspace and return 0.
 *
 * Exit reasons without an entry here are left as NULL slots.  The table is
 * not const: per the comment on handle_vmx_instruction(), the VMX instruction
 * entries are overwritten by nested_vmx_setup() when nested=1.
 */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
        [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
        [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
        [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
        [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
        [EXIT_REASON_CR_ACCESS]               = handle_cr,
        [EXIT_REASON_DR_ACCESS]               = handle_dr,
        [EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
        [EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
        [EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
        [EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = kvm_emulate_halt,
        [EXIT_REASON_INVD]                    = kvm_emulate_invd,
        [EXIT_REASON_INVLPG]                  = handle_invlpg,
        [EXIT_REASON_RDPMC]                   = kvm_emulate_rdpmc,
        [EXIT_REASON_VMCALL]                  = kvm_emulate_hypercall,
        /* VMX instructions: #UD by default, see handle_vmx_instruction(). */
        [EXIT_REASON_VMCLEAR]                 = handle_vmx_instruction,
        [EXIT_REASON_VMLAUNCH]                = handle_vmx_instruction,
        [EXIT_REASON_VMPTRLD]                 = handle_vmx_instruction,
        [EXIT_REASON_VMPTRST]                 = handle_vmx_instruction,
        [EXIT_REASON_VMREAD]                  = handle_vmx_instruction,
        [EXIT_REASON_VMRESUME]                = handle_vmx_instruction,
        [EXIT_REASON_VMWRITE]                 = handle_vmx_instruction,
        [EXIT_REASON_VMOFF]                   = handle_vmx_instruction,
        [EXIT_REASON_VMON]                    = handle_vmx_instruction,
        [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_APIC_WRITE]              = handle_apic_write,
        [EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
        [EXIT_REASON_WBINVD]                  = kvm_emulate_wbinvd,
        [EXIT_REASON_XSETBV]                  = kvm_emulate_xsetbv,
        [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
        [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
        [EXIT_REASON_GDTR_IDTR]               = handle_desc,
        [EXIT_REASON_LDTR_TR]                 = handle_desc,
        [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
        [EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
        [EXIT_REASON_MWAIT_INSTRUCTION]       = kvm_emulate_mwait,
        [EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
        [EXIT_REASON_MONITOR_INSTRUCTION]     = kvm_emulate_monitor,
        [EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
        [EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
        [EXIT_REASON_RDRAND]                  = kvm_handle_invalid_op,
        [EXIT_REASON_RDSEED]                  = kvm_handle_invalid_op,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
        [EXIT_REASON_INVPCID]                 = handle_invpcid,
        [EXIT_REASON_VMFUNC]                  = handle_vmx_instruction,
        [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
        [EXIT_REASON_ENCLS]                   = handle_encls,
        [EXIT_REASON_BUS_LOCK]                = handle_bus_lock_vmexit,
        [EXIT_REASON_NOTIFY]                  = handle_notify,
};
6169
/*
 * Number of slots in kvm_vmx_exit_handlers[], i.e. one more than the highest
 * exit reason that has an entry in the table.
 */
static const int kvm_vmx_max_exit_handlers =
        ARRAY_SIZE(kvm_vmx_exit_handlers);
6172
6173 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
6174                        u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
6175 {
6176         struct vcpu_vmx *vmx = to_vmx(vcpu);
6177
6178         *reason = vmx->exit_reason.full;
6179         *info1 = vmx_get_exit_qual(vcpu);
6180         if (!(vmx->exit_reason.failed_vmentry)) {
6181                 *info2 = vmx->idt_vectoring_info;
6182                 *intr_info = vmx_get_intr_info(vcpu);
6183                 if (is_exception_with_error_code(*intr_info))
6184                         *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6185                 else
6186                         *error_code = 0;
6187         } else {
6188                 *info2 = 0;
6189                 *intr_info = 0;
6190                 *error_code = 0;
6191         }
6192 }
6193
6194 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6195 {
6196         if (vmx->pml_pg) {
6197                 __free_page(vmx->pml_pg);
6198                 vmx->pml_pg = NULL;
6199         }
6200 }
6201
6202 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
6203 {
6204         struct vcpu_vmx *vmx = to_vmx(vcpu);
6205         u64 *pml_buf;
6206         u16 pml_idx;
6207
6208         pml_idx = vmcs_read16(GUEST_PML_INDEX);
6209
6210         /* Do nothing if PML buffer is empty */
6211         if (pml_idx == (PML_ENTITY_NUM - 1))
6212                 return;
6213
6214         /* PML index always points to next available PML buffer entity */
6215         if (pml_idx >= PML_ENTITY_NUM)
6216                 pml_idx = 0;
6217         else
6218                 pml_idx++;
6219
6220         pml_buf = page_address(vmx->pml_pg);
6221         for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
6222                 u64 gpa;
6223
6224                 gpa = pml_buf[pml_idx];
6225                 WARN_ON(gpa & (PAGE_SIZE - 1));
6226                 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
6227         }
6228
6229         /* reset PML index */
6230         vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6231 }
6232
6233 static void vmx_dump_sel(char *name, uint32_t sel)
6234 {
6235         pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6236                name, vmcs_read16(sel),
6237                vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6238                vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6239                vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
6240 }
6241
6242 static void vmx_dump_dtsel(char *name, uint32_t limit)
6243 {
6244         pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
6245                name, vmcs_read32(limit),
6246                vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
6247 }
6248
6249 static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
6250 {
6251         unsigned int i;
6252         struct vmx_msr_entry *e;
6253
6254         pr_err("MSR %s:\n", name);
6255         for (i = 0, e = m->val; i < m->nr; ++i, ++e)
6256                 pr_err("  %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
6257 }
6258
6259 void dump_vmcs(struct kvm_vcpu *vcpu)
6260 {
6261         struct vcpu_vmx *vmx = to_vmx(vcpu);
6262         u32 vmentry_ctl, vmexit_ctl;
6263         u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
6264         u64 tertiary_exec_control;
6265         unsigned long cr4;
6266         int efer_slot;
6267
6268         if (!dump_invalid_vmcs) {
6269                 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6270                 return;
6271         }
6272
6273         vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6274         vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6275         cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6276         pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6277         cr4 = vmcs_readl(GUEST_CR4);
6278
6279         if (cpu_has_secondary_exec_ctrls())
6280                 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6281         else
6282                 secondary_exec_control = 0;
6283
6284         if (cpu_has_tertiary_exec_ctrls())
6285                 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
6286         else
6287                 tertiary_exec_control = 0;
6288
6289         pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
6290                vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
6291         pr_err("*** Guest State ***\n");
6292         pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6293                vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6294                vmcs_readl(CR0_GUEST_HOST_MASK));
6295         pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6296                cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6297         pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
6298         if (cpu_has_vmx_ept()) {
6299                 pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
6300                        vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6301                 pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
6302                        vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
6303         }
6304         pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
6305                vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6306         pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
6307                vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6308         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6309                vmcs_readl(GUEST_SYSENTER_ESP),
6310                vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6311         vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
6312         vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
6313         vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
6314         vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
6315         vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
6316         vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
6317         vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6318         vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6319         vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6320         vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
6321         efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
6322         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
6323                 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
6324         else if (efer_slot >= 0)
6325                 pr_err("EFER= 0x%016llx (autoload)\n",
6326                        vmx->msr_autoload.guest.val[efer_slot].value);
6327         else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
6328                 pr_err("EFER= 0x%016llx (effective)\n",
6329                        vcpu->arch.efer | (EFER_LMA | EFER_LME));
6330         else
6331                 pr_err("EFER= 0x%016llx (effective)\n",
6332                        vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
6333         if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
6334                 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
6335         pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
6336                vmcs_read64(GUEST_IA32_DEBUGCTL),
6337                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6338         if (cpu_has_load_perf_global_ctrl() &&
6339             vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6340                 pr_err("PerfGlobCtl = 0x%016llx\n",
6341                        vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6342         if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6343                 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6344         pr_err("Interruptibility = %08x  ActivityState = %08x\n",
6345                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6346                vmcs_read32(GUEST_ACTIVITY_STATE));
6347         if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6348                 pr_err("InterruptStatus = %04x\n",
6349                        vmcs_read16(GUEST_INTR_STATUS));
6350         if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
6351                 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
6352         if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
6353                 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
6354
6355         pr_err("*** Host State ***\n");
6356         pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
6357                vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6358         pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6359                vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6360                vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6361                vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6362                vmcs_read16(HOST_TR_SELECTOR));
6363         pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6364                vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6365                vmcs_readl(HOST_TR_BASE));
6366         pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6367                vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6368         pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6369                vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6370                vmcs_readl(HOST_CR4));
6371         pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6372                vmcs_readl(HOST_IA32_SYSENTER_ESP),
6373                vmcs_read32(HOST_IA32_SYSENTER_CS),
6374                vmcs_readl(HOST_IA32_SYSENTER_EIP));
6375         if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
6376                 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
6377         if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
6378                 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
6379         if (cpu_has_load_perf_global_ctrl() &&
6380             vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6381                 pr_err("PerfGlobCtl = 0x%016llx\n",
6382                        vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6383         if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
6384                 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
6385
6386         pr_err("*** Control State ***\n");
6387         pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6388                cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
6389         pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
6390                pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
6391         pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6392                vmcs_read32(EXCEPTION_BITMAP),
6393                vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6394                vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6395         pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6396                vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6397                vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6398                vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6399         pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6400                vmcs_read32(VM_EXIT_INTR_INFO),
6401                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6402                vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6403         pr_err("        reason=%08x qualification=%016lx\n",
6404                vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6405         pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6406                vmcs_read32(IDT_VECTORING_INFO_FIELD),
6407                vmcs_read32(IDT_VECTORING_ERROR_CODE));
6408         pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6409         if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6410                 pr_err("TSC Multiplier = 0x%016llx\n",
6411                        vmcs_read64(TSC_MULTIPLIER));
6412         if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6413                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6414                         u16 status = vmcs_read16(GUEST_INTR_STATUS);
6415                         pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6416                 }
6417                 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6418                 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6419                         pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6420                 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6421         }
6422         if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6423                 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6424         if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6425                 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6426         if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6427                 pr_err("PLE Gap=%08x Window=%08x\n",
6428                        vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6429         if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6430                 pr_err("Virtual processor ID = 0x%04x\n",
6431                        vmcs_read16(VIRTUAL_PROCESSOR_ID));
6432         if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
6433                 struct vmx_ve_information *ve_info = vmx->ve_info;
6434                 u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
6435
6436                 /*
6437                  * If KVM is dumping the VMCS, then something has gone wrong
6438                  * already.  Derefencing an address from the VMCS, which could
6439                  * very well be corrupted, is a terrible idea.  The virtual
6440                  * address is known so use it.
6441                  */
6442                 pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
6443                        ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
6444                 pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
6445                        ve_info->exit_reason, ve_info->delivery,
6446                        ve_info->exit_qualification,
6447                        ve_info->guest_linear_address,
6448                        ve_info->guest_physical_address, ve_info->eptp_index);
6449         }
6450 }
6451
6452 /*
6453  * The guest has exited.  See if we can fix it or if we need userspace
6454  * assistance.
6455  */
6456 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6457 {
6458         struct vcpu_vmx *vmx = to_vmx(vcpu);
6459         union vmx_exit_reason exit_reason = vmx->exit_reason;
6460         u32 vectoring_info = vmx->idt_vectoring_info;
6461         u16 exit_handler_index;
6462
6463         /*
6464          * Flush logged GPAs PML buffer, this will make dirty_bitmap more
6465          * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
6466          * querying dirty_bitmap, we only need to kick all vcpus out of guest
6467          * mode as if vcpus is in root mode, the PML buffer must has been
6468          * flushed already.  Note, PML is never enabled in hardware while
6469          * running L2.
6470          */
6471         if (enable_pml && !is_guest_mode(vcpu))
6472                 vmx_flush_pml_buffer(vcpu);
6473
6474         /*
6475          * KVM should never reach this point with a pending nested VM-Enter.
6476          * More specifically, short-circuiting VM-Entry to emulate L2 due to
6477          * invalid guest state should never happen as that means KVM knowingly
6478          * allowed a nested VM-Enter with an invalid vmcs12.  More below.
6479          */
6480         if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6481                 return -EIO;
6482
6483         if (is_guest_mode(vcpu)) {
6484                 /*
6485                  * PML is never enabled when running L2, bail immediately if a
6486                  * PML full exit occurs as something is horribly wrong.
6487                  */
6488                 if (exit_reason.basic == EXIT_REASON_PML_FULL)
6489                         goto unexpected_vmexit;
6490
6491                 /*
6492                  * The host physical addresses of some pages of guest memory
6493                  * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6494                  * Page). The CPU may write to these pages via their host
6495                  * physical address while L2 is running, bypassing any
6496                  * address-translation-based dirty tracking (e.g. EPT write
6497                  * protection).
6498                  *
6499                  * Mark them dirty on every exit from L2 to prevent them from
6500                  * getting out of sync with dirty tracking.
6501                  */
6502                 nested_mark_vmcs12_pages_dirty(vcpu);
6503
6504                 /*
6505                  * Synthesize a triple fault if L2 state is invalid.  In normal
6506                  * operation, nested VM-Enter rejects any attempt to enter L2
6507                  * with invalid state.  However, those checks are skipped if
6508                  * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
6509                  * L2 state is invalid, it means either L1 modified SMRAM state
6510                  * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
6511                  * doing so is architecturally allowed in the RSM case, and is
6512                  * the least awful solution for the userspace case without
6513                  * risking false positives.
6514                  */
6515                 if (vmx->emulation_required) {
6516                         nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6517                         return 1;
6518                 }
6519
6520                 if (nested_vmx_reflect_vmexit(vcpu))
6521                         return 1;
6522         }
6523
6524         /* If guest state is invalid, start emulating.  L2 is handled above. */
6525         if (vmx->emulation_required)
6526                 return handle_invalid_guest_state(vcpu);
6527
6528         if (exit_reason.failed_vmentry) {
6529                 dump_vmcs(vcpu);
6530                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6531                 vcpu->run->fail_entry.hardware_entry_failure_reason
6532                         = exit_reason.full;
6533                 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6534                 return 0;
6535         }
6536
6537         if (unlikely(vmx->fail)) {
6538                 dump_vmcs(vcpu);
6539                 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6540                 vcpu->run->fail_entry.hardware_entry_failure_reason
6541                         = vmcs_read32(VM_INSTRUCTION_ERROR);
6542                 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6543                 return 0;
6544         }
6545
6546         /*
6547          * Note:
6548          * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
6549          * delivery event since it indicates guest is accessing MMIO.
6550          * The vm-exit can be triggered again after return to guest that
6551          * will cause infinite loop.
6552          */
6553         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6554             (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6555              exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6556              exit_reason.basic != EXIT_REASON_PML_FULL &&
6557              exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6558              exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6559              exit_reason.basic != EXIT_REASON_NOTIFY)) {
6560                 int ndata = 3;
6561
6562                 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6563                 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6564                 vcpu->run->internal.data[0] = vectoring_info;
6565                 vcpu->run->internal.data[1] = exit_reason.full;
6566                 vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu);
6567                 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6568                         vcpu->run->internal.data[ndata++] =
6569                                 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6570                 }
6571                 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6572                 vcpu->run->internal.ndata = ndata;
6573                 return 0;
6574         }
6575
6576         if (unlikely(!enable_vnmi &&
6577                      vmx->loaded_vmcs->soft_vnmi_blocked)) {
6578                 if (!vmx_interrupt_blocked(vcpu)) {
6579                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6580                 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6581                            vcpu->arch.nmi_pending) {
6582                         /*
6583                          * This CPU don't support us in finding the end of an
6584                          * NMI-blocked window if the guest runs with IRQs
6585                          * disabled. So we pull the trigger after 1 s of
6586                          * futile waiting, but inform the user about this.
6587                          */
6588                         printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6589                                "state on VCPU %d after 1 s timeout\n",
6590                                __func__, vcpu->vcpu_id);
6591                         vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6592                 }
6593         }
6594
6595         if (exit_fastpath != EXIT_FASTPATH_NONE)
6596                 return 1;
6597
6598         if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6599                 goto unexpected_vmexit;
6600 #ifdef CONFIG_MITIGATION_RETPOLINE
6601         if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6602                 return kvm_emulate_wrmsr(vcpu);
6603         else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6604                 return handle_preemption_timer(vcpu);
6605         else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6606                 return handle_interrupt_window(vcpu);
6607         else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6608                 return handle_external_interrupt(vcpu);
6609         else if (exit_reason.basic == EXIT_REASON_HLT)
6610                 return kvm_emulate_halt(vcpu);
6611         else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6612                 return handle_ept_misconfig(vcpu);
6613 #endif
6614
6615         exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6616                                                 kvm_vmx_max_exit_handlers);
6617         if (!kvm_vmx_exit_handlers[exit_handler_index])
6618                 goto unexpected_vmexit;
6619
6620         return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6621
6622 unexpected_vmexit:
6623         vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6624                     exit_reason.full);
6625         dump_vmcs(vcpu);
6626         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6627         vcpu->run->internal.suberror =
6628                         KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6629         vcpu->run->internal.ndata = 2;
6630         vcpu->run->internal.data[0] = exit_reason.full;
6631         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
6632         return 0;
6633 }
6634
6635 int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6636 {
6637         int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6638
6639         /*
6640          * Exit to user space when bus lock detected to inform that there is
6641          * a bus lock in guest.
6642          */
6643         if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
6644                 if (ret > 0)
6645                         vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6646
6647                 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
6648                 return 0;
6649         }
6650         return ret;
6651 }
6652
6653 /*
6654  * Software based L1D cache flush which is used when microcode providing
6655  * the cache control MSR is not loaded.
6656  *
6657  * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
6658  * flush it is required to read in 64 KiB because the replacement algorithm
6659  * is not exactly LRU. This could be sized at runtime via topology
6660  * information but as all relevant affected CPUs have 32KiB L1D cache size
6661  * there is no point in doing so.
6662  */
6663 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
6664 {
6665         int size = PAGE_SIZE << L1D_CACHE_ORDER;
6666
6667         /*
6668          * This code is only executed when the flush mode is 'cond' or
6669          * 'always'
6670          */
6671         if (static_branch_likely(&vmx_l1d_flush_cond)) {
6672                 bool flush_l1d;
6673
6674                 /*
6675                  * Clear the per-vcpu flush bit, it gets set again if the vCPU
6676                  * is reloaded, i.e. if the vCPU is scheduled out or if KVM
6677                  * exits to userspace, or if KVM reaches one of the unsafe
6678                  * VMEXIT handlers, e.g. if KVM calls into the emulator.
6679                  */
6680                 flush_l1d = vcpu->arch.l1tf_flush_l1d;
6681                 vcpu->arch.l1tf_flush_l1d = false;
6682
6683                 /*
6684                  * Clear the per-cpu flush bit, it gets set again from
6685                  * the interrupt handlers.
6686                  */
6687                 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6688                 kvm_clear_cpu_l1tf_flush_l1d();
6689
6690                 if (!flush_l1d)
6691                         return;
6692         }
6693
6694         vcpu->stat.l1d_flush++;
6695
6696         if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6697                 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
6698                 return;
6699         }
6700
6701         asm volatile(
6702                 /* First ensure the pages are in the TLB */
6703                 "xorl   %%eax, %%eax\n"
6704                 ".Lpopulate_tlb:\n\t"
6705                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6706                 "addl   $4096, %%eax\n\t"
6707                 "cmpl   %%eax, %[size]\n\t"
6708                 "jne    .Lpopulate_tlb\n\t"
6709                 "xorl   %%eax, %%eax\n\t"
6710                 "cpuid\n\t"
6711                 /* Now fill the cache */
6712                 "xorl   %%eax, %%eax\n"
6713                 ".Lfill_cache:\n"
6714                 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6715                 "addl   $64, %%eax\n\t"
6716                 "cmpl   %%eax, %[size]\n\t"
6717                 "jne    .Lfill_cache\n\t"
6718                 "lfence\n"
6719                 :: [flush_pages] "r" (vmx_l1d_flush_pages),
6720                     [size] "r" (size)
6721                 : "eax", "ebx", "ecx", "edx");
6722 }
6723
6724 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6725 {
6726         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6727         int tpr_threshold;
6728
6729         if (is_guest_mode(vcpu) &&
6730                 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6731                 return;
6732
6733         tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6734         if (is_guest_mode(vcpu))
6735                 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6736         else
6737                 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6738 }
6739
6740 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6741 {
6742         struct vcpu_vmx *vmx = to_vmx(vcpu);
6743         u32 sec_exec_control;
6744
6745         if (!lapic_in_kernel(vcpu))
6746                 return;
6747
6748         if (!flexpriority_enabled &&
6749             !cpu_has_vmx_virtualize_x2apic_mode())
6750                 return;
6751
6752         /* Postpone execution until vmcs01 is the current VMCS. */
6753         if (is_guest_mode(vcpu)) {
6754                 vmx->nested.change_vmcs01_virtual_apic_mode = true;
6755                 return;
6756         }
6757
6758         sec_exec_control = secondary_exec_controls_get(vmx);
6759         sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6760                               SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6761
6762         switch (kvm_get_apic_mode(vcpu)) {
6763         case LAPIC_MODE_INVALID:
6764                 WARN_ONCE(true, "Invalid local APIC state");
6765                 break;
6766         case LAPIC_MODE_DISABLED:
6767                 break;
6768         case LAPIC_MODE_XAPIC:
6769                 if (flexpriority_enabled) {
6770                         sec_exec_control |=
6771                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6772                         kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6773
6774                         /*
6775                          * Flush the TLB, reloading the APIC access page will
6776                          * only do so if its physical address has changed, but
6777                          * the guest may have inserted a non-APIC mapping into
6778                          * the TLB while the APIC access page was disabled.
6779                          */
6780                         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6781                 }
6782                 break;
6783         case LAPIC_MODE_X2APIC:
6784                 if (cpu_has_vmx_virtualize_x2apic_mode())
6785                         sec_exec_control |=
6786                                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6787                 break;
6788         }
6789         secondary_exec_controls_set(vmx, sec_exec_control);
6790
6791         vmx_update_msr_bitmap_x2apic(vcpu);
6792 }
6793
6794 void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6795 {
6796         const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
6797         struct kvm *kvm = vcpu->kvm;
6798         struct kvm_memslots *slots = kvm_memslots(kvm);
6799         struct kvm_memory_slot *slot;
6800         struct page *refcounted_page;
6801         unsigned long mmu_seq;
6802         kvm_pfn_t pfn;
6803         bool writable;
6804
6805         /* Defer reload until vmcs01 is the current VMCS. */
6806         if (is_guest_mode(vcpu)) {
6807                 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6808                 return;
6809         }
6810
6811         if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
6812             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6813                 return;
6814
6815         /*
6816          * Explicitly grab the memslot using KVM's internal slot ID to ensure
6817          * KVM doesn't unintentionally grab a userspace memslot.  It _should_
6818          * be impossible for userspace to create a memslot for the APIC when
6819          * APICv is enabled, but paranoia won't hurt in this case.
6820          */
6821         slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
6822         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
6823                 return;
6824
6825         /*
6826          * Ensure that the mmu_notifier sequence count is read before KVM
6827          * retrieves the pfn from the primary MMU.  Note, the memslot is
6828          * protected by SRCU, not the mmu_notifier.  Pairs with the smp_wmb()
6829          * in kvm_mmu_invalidate_end().
6830          */
6831         mmu_seq = kvm->mmu_invalidate_seq;
6832         smp_rmb();
6833
6834         /*
6835          * No need to retry if the memslot does not exist or is invalid.  KVM
6836          * controls the APIC-access page memslot, and only deletes the memslot
6837          * if APICv is permanently inhibited, i.e. the memslot won't reappear.
6838          */
6839         pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
6840         if (is_error_noslot_pfn(pfn))
6841                 return;
6842
6843         read_lock(&vcpu->kvm->mmu_lock);
6844         if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
6845                 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6846         else
6847                 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
6848
6849         /*
6850          * Do not pin the APIC access page in memory so that it can be freely
6851          * migrated, the MMU notifier will call us again if it is migrated or
6852          * swapped out.  KVM backs the memslot with anonymous memory, the pfn
6853          * should always point at a refcounted page (if the pfn is valid).
6854          */
6855         if (!WARN_ON_ONCE(!refcounted_page))
6856                 kvm_release_page_clean(refcounted_page);
6857
6858         /*
6859          * No need for a manual TLB flush at this point, KVM has already done a
6860          * flush if there were SPTEs pointing at the previous page.
6861          */
6862         read_unlock(&vcpu->kvm->mmu_lock);
6863 }
6864
6865 void vmx_hwapic_isr_update(int max_isr)
6866 {
6867         u16 status;
6868         u8 old;
6869
6870         if (max_isr == -1)
6871                 max_isr = 0;
6872
6873         status = vmcs_read16(GUEST_INTR_STATUS);
6874         old = status >> 8;
6875         if (max_isr != old) {
6876                 status &= 0xff;
6877                 status |= max_isr << 8;
6878                 vmcs_write16(GUEST_INTR_STATUS, status);
6879         }
6880 }
6881
6882 static void vmx_set_rvi(int vector)
6883 {
6884         u16 status;
6885         u8 old;
6886
6887         if (vector == -1)
6888                 vector = 0;
6889
6890         status = vmcs_read16(GUEST_INTR_STATUS);
6891         old = (u8)status & 0xff;
6892         if ((u8)vector != old) {
6893                 status &= ~0xff;
6894                 status |= (u8)vector;
6895                 vmcs_write16(GUEST_INTR_STATUS, status);
6896         }
6897 }
6898
/* Propagate the highest pending vector to RVI, unless L2 is running. */
void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
        /*
         * While L2 runs, RVI only matters if vmcs12 enables virtual-interrupt
         * delivery.  That in turn is possible only if L1 also intercepts
         * external interrupts, in which case the interrupt must be
         * intercepted rather than reflected into vmcs02's RVI.  Hence there
         * is nothing to do while in guest mode.
         */
        if (is_guest_mode(vcpu))
                return;

        vmx_set_rvi(max_irr);
}
6912
6913 int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6914 {
6915         struct vcpu_vmx *vmx = to_vmx(vcpu);
6916         int max_irr;
6917         bool got_posted_interrupt;
6918
6919         if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
6920                 return -EIO;
6921
6922         if (pi_test_on(&vmx->pi_desc)) {
6923                 pi_clear_on(&vmx->pi_desc);
6924                 /*
6925                  * IOMMU can write to PID.ON, so the barrier matters even on UP.
6926                  * But on x86 this is just a compiler barrier anyway.
6927                  */
6928                 smp_mb__after_atomic();
6929                 got_posted_interrupt =
6930                         kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6931         } else {
6932                 max_irr = kvm_lapic_find_highest_irr(vcpu);
6933                 got_posted_interrupt = false;
6934         }
6935
6936         /*
6937          * Newly recognized interrupts are injected via either virtual interrupt
6938          * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
6939          * disabled in two cases:
6940          *
6941          * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
6942          * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
6943          * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
6944          * into L2, but KVM doesn't use virtual interrupt delivery to inject
6945          * interrupts into L2, and so KVM_REQ_EVENT is again needed.
6946          *
6947          * 2) If APICv is disabled for this vCPU, assigned devices may still
6948          * attempt to post interrupts.  The posted interrupt vector will cause
6949          * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
6950          */
6951         if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
6952                 vmx_set_rvi(max_irr);
6953         else if (got_posted_interrupt)
6954                 kvm_make_request(KVM_REQ_EVENT, vcpu);
6955
6956         return max_irr;
6957 }
6958
6959 void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6960 {
6961         if (!kvm_vcpu_apicv_active(vcpu))
6962                 return;
6963
6964         vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6965         vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6966         vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6967         vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6968 }
6969
6970 void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
6971 {
6972         struct vcpu_vmx *vmx = to_vmx(vcpu);
6973
6974         pi_clear_on(&vmx->pi_desc);
6975         memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6976 }
6977
/*
 * Prototypes only, the definitions are not in this part of the file.
 *
 * vmx_do_interrupt_irqoff() is passed the offset stored in a host IDT gate,
 * vmx_do_nmi_irqoff() takes no argument.
 */
void vmx_do_interrupt_irqoff(unsigned long entry);
void vmx_do_nmi_irqoff(void);
6980
6981 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
6982 {
6983         /*
6984          * Save xfd_err to guest_fpu before interrupt is enabled, so the
6985          * MSR value is not clobbered by the host activity before the guest
6986          * has chance to consume it.
6987          *
6988          * Do not blindly read xfd_err here, since this exception might
6989          * be caused by L1 interception on a platform which doesn't
6990          * support xfd at all.
6991          *
6992          * Do it conditionally upon guest_fpu::xfd. xfd_err matters
6993          * only when xfd contains a non-zero value.
6994          *
6995          * Queuing exception is done in vmx_handle_exit. See comment there.
6996          */
6997         if (vcpu->arch.guest_fpu.fpstate->xfd)
6998                 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
6999 }
7000
7001 static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
7002 {
7003         /* if exit due to PF check for async PF */
7004         if (is_page_fault(intr_info))
7005                 vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
7006         /* if exit due to NM, handle before interrupts are enabled */
7007         else if (is_nm_fault(intr_info))
7008                 handle_nm_fault_irqoff(vcpu);
7009         /* Handle machine checks before interrupts are enabled */
7010         else if (is_machine_check(intr_info))
7011                 kvm_machine_check();
7012 }
7013
7014 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
7015                                              u32 intr_info)
7016 {
7017         unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
7018
7019         if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
7020             "unexpected VM-Exit interrupt info: 0x%x", intr_info))
7021                 return;
7022
7023         kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
7024         if (cpu_feature_enabled(X86_FEATURE_FRED))
7025                 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
7026         else
7027                 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
7028         kvm_after_interrupt(vcpu);
7029
7030         vcpu->arch.at_instruction_boundary = true;
7031 }
7032
7033 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
7034 {
7035         struct vcpu_vmx *vmx = to_vmx(vcpu);
7036
7037         if (vmx->emulation_required)
7038                 return;
7039
7040         if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
7041                 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
7042         else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
7043                 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
7044 }
7045
7046 /*
7047  * The kvm parameter can be NULL (module initialization, or invocation before
7048  * VM creation). Be sure to check the kvm parameter before using it.
7049  */
7050 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
7051 {
7052         switch (index) {
7053         case MSR_IA32_SMBASE:
7054                 if (!IS_ENABLED(CONFIG_KVM_SMM))
7055                         return false;
7056                 /*
7057                  * We cannot do SMM unless we can run the guest in big
7058                  * real mode.
7059                  */
7060                 return enable_unrestricted_guest || emulate_invalid_guest_state;
7061         case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
7062                 return nested;
7063         case MSR_AMD64_VIRT_SPEC_CTRL:
7064         case MSR_AMD64_TSC_RATIO:
7065                 /* This is AMD only.  */
7066                 return false;
7067         default:
7068                 return true;
7069         }
7070 }
7071
7072 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
7073 {
7074         u32 exit_intr_info;
7075         bool unblock_nmi;
7076         u8 vector;
7077         bool idtv_info_valid;
7078
7079         idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7080
7081         if (enable_vnmi) {
7082                 if (vmx->loaded_vmcs->nmi_known_unmasked)
7083                         return;
7084
7085                 exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
7086                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
7087                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
7088                 /*
7089                  * SDM 3: 27.7.1.2 (September 2008)
7090                  * Re-set bit "block by NMI" before VM entry if vmexit caused by
7091                  * a guest IRET fault.
7092                  * SDM 3: 23.2.2 (September 2008)
7093                  * Bit 12 is undefined in any of the following cases:
7094                  *  If the VM exit sets the valid bit in the IDT-vectoring
7095                  *   information field.
7096                  *  If the VM exit is due to a double fault.
7097                  */
7098                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
7099                     vector != DF_VECTOR && !idtv_info_valid)
7100                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7101                                       GUEST_INTR_STATE_NMI);
7102                 else
7103                         vmx->loaded_vmcs->nmi_known_unmasked =
7104                                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7105                                   & GUEST_INTR_STATE_NMI);
7106         } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7107                 vmx->loaded_vmcs->vnmi_blocked_time +=
7108                         ktime_to_ns(ktime_sub(ktime_get(),
7109                                               vmx->loaded_vmcs->entry_time));
7110 }
7111
7112 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7113                                       u32 idt_vectoring_info,
7114                                       int instr_len_field,
7115                                       int error_code_field)
7116 {
7117         u8 vector;
7118         int type;
7119         bool idtv_info_valid;
7120
7121         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7122
7123         vcpu->arch.nmi_injected = false;
7124         kvm_clear_exception_queue(vcpu);
7125         kvm_clear_interrupt_queue(vcpu);
7126
7127         if (!idtv_info_valid)
7128                 return;
7129
7130         kvm_make_request(KVM_REQ_EVENT, vcpu);
7131
7132         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7133         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
7134
7135         switch (type) {
7136         case INTR_TYPE_NMI_INTR:
7137                 vcpu->arch.nmi_injected = true;
7138                 /*
7139                  * SDM 3: 27.7.1.2 (September 2008)
7140                  * Clear bit "block by NMI" before VM entry if a NMI
7141                  * delivery faulted.
7142                  */
7143                 vmx_set_nmi_mask(vcpu, false);
7144                 break;
7145         case INTR_TYPE_SOFT_EXCEPTION:
7146                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7147                 fallthrough;
7148         case INTR_TYPE_HARD_EXCEPTION:
7149                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7150                         u32 err = vmcs_read32(error_code_field);
7151                         kvm_requeue_exception_e(vcpu, vector, err);
7152                 } else
7153                         kvm_requeue_exception(vcpu, vector);
7154                 break;
7155         case INTR_TYPE_SOFT_INTR:
7156                 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7157                 fallthrough;
7158         case INTR_TYPE_EXT_INTR:
7159                 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7160                 break;
7161         default:
7162                 break;
7163         }
7164 }
7165
7166 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7167 {
7168         __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7169                                   VM_EXIT_INSTRUCTION_LEN,
7170                                   IDT_VECTORING_ERROR_CODE);
7171 }
7172
7173 void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7174 {
7175         __vmx_complete_interrupts(vcpu,
7176                                   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7177                                   VM_ENTRY_INSTRUCTION_LEN,
7178                                   VM_ENTRY_EXCEPTION_ERROR_CODE);
7179
7180         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
7181 }
7182
7183 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
7184 {
7185         int i, nr_msrs;
7186         struct perf_guest_switch_msr *msrs;
7187         struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
7188
7189         pmu->host_cross_mapped_mask = 0;
7190         if (pmu->pebs_enable & pmu->global_ctrl)
7191                 intel_pmu_cross_mapped_check(pmu);
7192
7193         /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
7194         msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
7195         if (!msrs)
7196                 return;
7197
7198         for (i = 0; i < nr_msrs; i++)
7199                 if (msrs[i].host == msrs[i].guest)
7200                         clear_atomic_switch_msr(vmx, msrs[i].msr);
7201                 else
7202                         add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7203                                         msrs[i].host, false);
7204 }
7205
7206 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
7207 {
7208         struct vcpu_vmx *vmx = to_vmx(vcpu);
7209         u64 tscl;
7210         u32 delta_tsc;
7211
7212         if (force_immediate_exit) {
7213                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7214                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7215         } else if (vmx->hv_deadline_tsc != -1) {
7216                 tscl = rdtsc();
7217                 if (vmx->hv_deadline_tsc > tscl)
7218                         /* set_hv_timer ensures the delta fits in 32-bits */
7219                         delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7220                                 cpu_preemption_timer_multi);
7221                 else
7222                         delta_tsc = 0;
7223
7224                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7225                 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7226         } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7227                 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7228                 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7229         }
7230 }
7231
7232 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
7233 {
7234         if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7235                 vmx->loaded_vmcs->host_state.rsp = host_rsp;
7236                 vmcs_writel(HOST_RSP, host_rsp);
7237         }
7238 }
7239
7240 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
7241                                         unsigned int flags)
7242 {
7243         u64 hostval = this_cpu_read(x86_spec_ctrl_current);
7244
7245         if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
7246                 return;
7247
7248         if (flags & VMX_RUN_SAVE_SPEC_CTRL)
7249                 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
7250
7251         /*
7252          * If the guest/host SPEC_CTRL values differ, restore the host value.
7253          *
7254          * For legacy IBRS, the IBRS bit always needs to be written after
7255          * transitioning from a less privileged predictor mode, regardless of
7256          * whether the guest/host values differ.
7257          */
7258         if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
7259             vmx->spec_ctrl != hostval)
7260                 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
7261
7262         barrier_nospec();
7263 }
7264
7265 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
7266                                              bool force_immediate_exit)
7267 {
7268         /*
7269          * If L2 is active, some VMX preemption timer exits can be handled in
7270          * the fastpath even, all other exits must use the slow path.
7271          */
7272         if (is_guest_mode(vcpu) &&
7273             to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
7274                 return EXIT_FASTPATH_NONE;
7275
7276         switch (to_vmx(vcpu)->exit_reason.basic) {
7277         case EXIT_REASON_MSR_WRITE:
7278                 return handle_fastpath_set_msr_irqoff(vcpu);
7279         case EXIT_REASON_PREEMPTION_TIMER:
7280                 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
7281         case EXIT_REASON_HLT:
7282                 return handle_fastpath_hlt(vcpu);
7283         default:
7284                 return EXIT_FASTPATH_NONE;
7285         }
7286 }
7287
7288 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
7289                                         unsigned int flags)
7290 {
7291         struct vcpu_vmx *vmx = to_vmx(vcpu);
7292
7293         guest_state_enter_irqoff();
7294
7295         /*
7296          * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
7297          * mitigation for MDS is done late in VMentry and is still
7298          * executed in spite of L1D Flush. This is because an extra VERW
7299          * should not matter much after the big hammer L1D Flush.
7300          */
7301         if (static_branch_unlikely(&vmx_l1d_should_flush))
7302                 vmx_l1d_flush(vcpu);
7303         else if (static_branch_unlikely(&mmio_stale_data_clear) &&
7304                  kvm_arch_has_assigned_device(vcpu->kvm))
7305                 mds_clear_cpu_buffers();
7306
7307         vmx_disable_fb_clear(vmx);
7308
7309         if (vcpu->arch.cr2 != native_read_cr2())
7310                 native_write_cr2(vcpu->arch.cr2);
7311
7312         vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
7313                                    flags);
7314
7315         vcpu->arch.cr2 = native_read_cr2();
7316         vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
7317
7318         vmx->idt_vectoring_info = 0;
7319
7320         vmx_enable_fb_clear(vmx);
7321
7322         if (unlikely(vmx->fail)) {
7323                 vmx->exit_reason.full = 0xdead;
7324                 goto out;
7325         }
7326
7327         vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
7328         if (likely(!vmx->exit_reason.failed_vmentry))
7329                 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7330
7331         if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
7332             is_nmi(vmx_get_intr_info(vcpu))) {
7333                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
7334                 if (cpu_feature_enabled(X86_FEATURE_FRED))
7335                         fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
7336                 else
7337                         vmx_do_nmi_irqoff();
7338                 kvm_after_interrupt(vcpu);
7339         }
7340
7341 out:
7342         guest_state_exit_irqoff();
7343 }
7344
7345 fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
7346 {
7347         struct vcpu_vmx *vmx = to_vmx(vcpu);
7348         unsigned long cr3, cr4;
7349
7350         /* Record the guest's net vcpu time for enforced NMI injections. */
7351         if (unlikely(!enable_vnmi &&
7352                      vmx->loaded_vmcs->soft_vnmi_blocked))
7353                 vmx->loaded_vmcs->entry_time = ktime_get();
7354
7355         /*
7356          * Don't enter VMX if guest state is invalid, let the exit handler
7357          * start emulation until we arrive back to a valid state.  Synthesize a
7358          * consistency check VM-Exit due to invalid guest state and bail.
7359          */
7360         if (unlikely(vmx->emulation_required)) {
7361                 vmx->fail = 0;
7362
7363                 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
7364                 vmx->exit_reason.failed_vmentry = 1;
7365                 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
7366                 vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
7367                 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
7368                 vmx->exit_intr_info = 0;
7369                 return EXIT_FASTPATH_NONE;
7370         }
7371
7372         trace_kvm_entry(vcpu, force_immediate_exit);
7373
7374         if (vmx->ple_window_dirty) {
7375                 vmx->ple_window_dirty = false;
7376                 vmcs_write32(PLE_WINDOW, vmx->ple_window);
7377         }
7378
7379         /*
7380          * We did this in prepare_switch_to_guest, because it needs to
7381          * be within srcu_read_lock.
7382          */
7383         WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
7384
7385         if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
7386                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
7387         if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
7388                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
7389         vcpu->arch.regs_dirty = 0;
7390
7391         /*
7392          * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
7393          * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
7394          * it switches back to the current->mm, which can occur in KVM context
7395          * when switching to a temporary mm to patch kernel code, e.g. if KVM
7396          * toggles a static key while handling a VM-Exit.
7397          */
7398         cr3 = __get_current_cr3_fast();
7399         if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7400                 vmcs_writel(HOST_CR3, cr3);
7401                 vmx->loaded_vmcs->host_state.cr3 = cr3;
7402         }
7403
7404         cr4 = cr4_read_shadow();
7405         if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7406                 vmcs_writel(HOST_CR4, cr4);
7407                 vmx->loaded_vmcs->host_state.cr4 = cr4;
7408         }
7409
7410         /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
7411         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
7412                 set_debugreg(vcpu->arch.dr6, 6);
7413
7414         /* When single-stepping over STI and MOV SS, we must clear the
7415          * corresponding interruptibility bits in the guest state. Otherwise
7416          * vmentry fails as it then expects bit 14 (BS) in pending debug
7417          * exceptions being set, but that's not correct for the guest debugging
7418          * case. */
7419         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7420                 vmx_set_interrupt_shadow(vcpu, 0);
7421
7422         kvm_load_guest_xsave_state(vcpu);
7423
7424         pt_guest_enter(vmx);
7425
7426         atomic_switch_perf_msrs(vmx);
7427         if (intel_pmu_lbr_is_enabled(vcpu))
7428                 vmx_passthrough_lbr_msrs(vcpu);
7429
7430         if (enable_preemption_timer)
7431                 vmx_update_hv_timer(vcpu, force_immediate_exit);
7432         else if (force_immediate_exit)
7433                 smp_send_reschedule(vcpu->cpu);
7434
7435         kvm_wait_lapic_expire(vcpu);
7436
7437         /* The actual VMENTER/EXIT is in the .noinstr.text section. */
7438         vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
7439
7440         /* All fields are clean at this point */
7441         if (kvm_is_using_evmcs()) {
7442                 current_evmcs->hv_clean_fields |=
7443                         HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7444
7445                 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
7446         }
7447
7448         /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7449         if (vmx->host_debugctlmsr)
7450                 update_debugctlmsr(vmx->host_debugctlmsr);
7451
7452 #ifndef CONFIG_X86_64
7453         /*
7454          * The sysexit path does not restore ds/es, so we must set them to
7455          * a reasonable value ourselves.
7456          *
7457          * We can't defer this to vmx_prepare_switch_to_host() since that
7458          * function may be executed in interrupt context, which saves and
7459          * restore segments around it, nullifying its effect.
7460          */
7461         loadsegment(ds, __USER_DS);
7462         loadsegment(es, __USER_DS);
7463 #endif
7464
7465         pt_guest_exit(vmx);
7466
7467         kvm_load_host_xsave_state(vcpu);
7468
7469         if (is_guest_mode(vcpu)) {
7470                 /*
7471                  * Track VMLAUNCH/VMRESUME that have made past guest state
7472                  * checking.
7473                  */
7474                 if (vmx->nested.nested_run_pending &&
7475                     !vmx->exit_reason.failed_vmentry)
7476                         ++vcpu->stat.nested_run;
7477
7478                 vmx->nested.nested_run_pending = 0;
7479         }
7480
7481         if (unlikely(vmx->fail))
7482                 return EXIT_FASTPATH_NONE;
7483
7484         if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
7485                 kvm_machine_check();
7486
7487         trace_kvm_exit(vcpu, KVM_ISA_VMX);
7488
7489         if (unlikely(vmx->exit_reason.failed_vmentry))
7490                 return EXIT_FASTPATH_NONE;
7491
7492         vmx->loaded_vmcs->launched = 1;
7493
7494         vmx_recover_nmi_blocking(vmx);
7495         vmx_complete_interrupts(vmx);
7496
7497         return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
7498 }
7499
7500 void vmx_vcpu_free(struct kvm_vcpu *vcpu)
7501 {
7502         struct vcpu_vmx *vmx = to_vmx(vcpu);
7503
7504         if (enable_pml)
7505                 vmx_destroy_pml_buffer(vmx);
7506         free_vpid(vmx->vpid);
7507         nested_vmx_free_vcpu(vcpu);
7508         free_loaded_vmcs(vmx->loaded_vmcs);
7509         free_page((unsigned long)vmx->ve_info);
7510 }
7511
7512 int vmx_vcpu_create(struct kvm_vcpu *vcpu)
7513 {
7514         struct vmx_uret_msr *tsx_ctrl;
7515         struct vcpu_vmx *vmx;
7516         int i, err;
7517
7518         BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
7519         vmx = to_vmx(vcpu);
7520
7521         INIT_LIST_HEAD(&vmx->pi_wakeup_list);
7522
7523         err = -ENOMEM;
7524
7525         vmx->vpid = allocate_vpid();
7526
7527         /*
7528          * If PML is turned on, failure on enabling PML just results in failure
7529          * of creating the vcpu, therefore we can simplify PML logic (by
7530          * avoiding dealing with cases, such as enabling PML partially on vcpus
7531          * for the guest), etc.
7532          */
7533         if (enable_pml) {
7534                 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7535                 if (!vmx->pml_pg)
7536                         goto free_vpid;
7537         }
7538
7539         for (i = 0; i < kvm_nr_uret_msrs; ++i)
7540                 vmx->guest_uret_msrs[i].mask = -1ull;
7541         if (boot_cpu_has(X86_FEATURE_RTM)) {
7542                 /*
7543                  * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7544                  * Keep the host value unchanged to avoid changing CPUID bits
7545                  * under the host kernel's feet.
7546                  */
7547                 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7548                 if (tsx_ctrl)
7549                         tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7550         }
7551
7552         err = alloc_loaded_vmcs(&vmx->vmcs01);
7553         if (err < 0)
7554                 goto free_pml;
7555
7556         /*
7557          * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7558          * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7559          * feature only for vmcs01, KVM currently isn't equipped to realize any
7560          * performance benefits from enabling it for vmcs02.
7561          */
7562         if (kvm_is_using_evmcs() &&
7563             (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7564                 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
7565
7566                 evmcs->hv_enlightenments_control.msr_bitmap = 1;
7567         }
7568
7569         /* The MSR bitmap starts with all ones */
7570         bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7571         bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7572
7573         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
7574 #ifdef CONFIG_X86_64
7575         vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
7576         vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
7577         vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
7578 #endif
7579         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7580         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7581         vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
7582         if (kvm_cstate_in_guest(vcpu->kvm)) {
7583                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
7584                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7585                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7586                 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
7587         }
7588
7589         vmx->loaded_vmcs = &vmx->vmcs01;
7590
7591         if (cpu_need_virtualize_apic_accesses(vcpu)) {
7592                 err = kvm_alloc_apic_access_page(vcpu->kvm);
7593                 if (err)
7594                         goto free_vmcs;
7595         }
7596
7597         if (enable_ept && !enable_unrestricted_guest) {
7598                 err = init_rmode_identity_map(vcpu->kvm);
7599                 if (err)
7600                         goto free_vmcs;
7601         }
7602
7603         err = -ENOMEM;
7604         if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
7605                 struct page *page;
7606
7607                 BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
7608
7609                 /* ve_info must be page aligned. */
7610                 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7611                 if (!page)
7612                         goto free_vmcs;
7613
7614                 vmx->ve_info = page_to_virt(page);
7615         }
7616
7617         if (vmx_can_use_ipiv(vcpu))
7618                 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
7619                            __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
7620
7621         return 0;
7622
7623 free_vmcs:
7624         free_loaded_vmcs(vmx->loaded_vmcs);
7625 free_pml:
7626         vmx_destroy_pml_buffer(vmx);
7627 free_vpid:
7628         free_vpid(vmx->vpid);
7629         return err;
7630 }
7631
7632 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7633 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7634
7635 int vmx_vm_init(struct kvm *kvm)
7636 {
7637         if (!ple_gap)
7638                 kvm->arch.pause_in_guest = true;
7639
7640         if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7641                 switch (l1tf_mitigation) {
7642                 case L1TF_MITIGATION_OFF:
7643                 case L1TF_MITIGATION_FLUSH_NOWARN:
7644                         /* 'I explicitly don't care' is set */
7645                         break;
7646                 case L1TF_MITIGATION_FLUSH:
7647                 case L1TF_MITIGATION_FLUSH_NOSMT:
7648                 case L1TF_MITIGATION_FULL:
7649                         /*
7650                          * Warn upon starting the first VM in a potentially
7651                          * insecure environment.
7652                          */
7653                         if (sched_smt_active())
7654                                 pr_warn_once(L1TF_MSG_SMT);
7655                         if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7656                                 pr_warn_once(L1TF_MSG_L1D);
7657                         break;
7658                 case L1TF_MITIGATION_FULL_FORCE:
7659                         /* Flush is enforced */
7660                         break;
7661                 }
7662         }
7663         return 0;
7664 }
7665
7666 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7667 {
7668         /*
7669          * Force UC for host MMIO regions, as allowing the guest to access MMIO
7670          * with cacheable accesses will result in Machine Checks.
7671          */
7672         if (is_mmio)
7673                 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7674
7675         /*
7676          * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
7677          * device attached.  Letting the guest control memory types on Intel
7678          * CPUs may result in unexpected behavior, and so KVM's ABI is to trust
7679          * the guest to behave only as a last resort.
7680          */
7681         if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
7682                 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
7683
7684         return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
7685 }
7686
7687 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
7688 {
7689         /*
7690          * These bits in the secondary execution controls field
7691          * are dynamic, the others are mostly based on the hypervisor
7692          * architecture and the guest's CPUID.  Do not touch the
7693          * dynamic bits.
7694          */
7695         u32 mask =
7696                 SECONDARY_EXEC_SHADOW_VMCS |
7697                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7698                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7699                 SECONDARY_EXEC_DESC;
7700
7701         u32 cur_ctl = secondary_exec_controls_get(vmx);
7702
7703         secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7704 }
7705
7706 /*
7707  * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7708  * (indicating "allowed-1") if they are supported in the guest's CPUID.
7709  */
7710 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7711 {
7712         struct vcpu_vmx *vmx = to_vmx(vcpu);
7713         struct kvm_cpuid_entry2 *entry;
7714
7715         vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7716         vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7717
7718 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {            \
7719         if (entry && (entry->_reg & (_cpuid_mask)))                     \
7720                 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);     \
7721 } while (0)
7722
7723         entry = kvm_find_cpuid_entry(vcpu, 0x1);
7724         cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
7725         cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
7726         cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
7727         cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
7728         cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
7729         cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
7730         cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
7731         cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
7732         cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
7733         cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7734         cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
7735         cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
7736         cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
7737         cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));
7738
7739         entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
7740         cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
7741         cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
7742         cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
7743         cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
7744         cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
7745         cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
7746
7747         entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
7748         cr4_fixed1_update(X86_CR4_LAM_SUP,    eax, feature_bit(LAM));
7749
7750 #undef cr4_fixed1_update
7751 }
7752
7753 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7754 {
7755         struct vcpu_vmx *vmx = to_vmx(vcpu);
7756         struct kvm_cpuid_entry2 *best = NULL;
7757         int i;
7758
7759         for (i = 0; i < PT_CPUID_LEAVES; i++) {
7760                 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
7761                 if (!best)
7762                         return;
7763                 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7764                 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7765                 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7766                 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7767         }
7768
7769         /* Get the number of configurable Address Ranges for filtering */
7770         vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
7771                                                 PT_CAP_num_address_ranges);
7772
7773         /* Initialize and clear the no dependency bits */
7774         vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7775                         RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
7776                         RTIT_CTL_BRANCH_EN);
7777
7778         /*
7779          * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
7780          * will inject an #GP
7781          */
7782         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7783                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7784
7785         /*
7786          * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7787          * PSBFreq can be set
7788          */
7789         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7790                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7791                                 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7792
7793         /*
7794          * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
7795          */
7796         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7797                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7798                                               RTIT_CTL_MTC_RANGE);
7799
7800         /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7801         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7802                 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7803                                                         RTIT_CTL_PTW_EN);
7804
7805         /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7806         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7807                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7808
7809         /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7810         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7811                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7812
7813         /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
7814         if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7815                 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7816
7817         /* unmask address range configure area */
7818         for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
7819                 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7820 }
7821
7822 void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7823 {
7824         struct vcpu_vmx *vmx = to_vmx(vcpu);
7825
7826         /*
7827          * XSAVES is effectively enabled if and only if XSAVE is also exposed
7828          * to the guest.  XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
7829          * set if and only if XSAVE is supported.
7830          */
7831         if (boot_cpu_has(X86_FEATURE_XSAVE) &&
7832             guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
7833                 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
7834
7835         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
7836         kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM);
7837
7838         vmx_setup_uret_msrs(vmx);
7839
7840         if (cpu_has_secondary_exec_ctrls())
7841                 vmcs_set_secondary_exec_control(vmx,
7842                                                 vmx_secondary_exec_control(vmx));
7843
7844         if (guest_can_use(vcpu, X86_FEATURE_VMX))
7845                 vmx->msr_ia32_feature_control_valid_bits |=
7846                         FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7847                         FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7848         else
7849                 vmx->msr_ia32_feature_control_valid_bits &=
7850                         ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7851                           FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7852
7853         if (guest_can_use(vcpu, X86_FEATURE_VMX))
7854                 nested_vmx_cr_fixed1_bits_update(vcpu);
7855
7856         if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7857                         guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7858                 update_intel_pt_cfg(vcpu);
7859
7860         if (boot_cpu_has(X86_FEATURE_RTM)) {
7861                 struct vmx_uret_msr *msr;
7862                 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7863                 if (msr) {
7864                         bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7865                         vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7866                 }
7867         }
7868
7869         if (kvm_cpu_cap_has(X86_FEATURE_XFD))
7870                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
7871                                           !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
7872
7873         if (boot_cpu_has(X86_FEATURE_IBPB))
7874                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
7875                                           !guest_has_pred_cmd_msr(vcpu));
7876
7877         if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
7878                 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
7879                                           !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
7880
7881         set_cr4_guest_host_mask(vmx);
7882
7883         vmx_write_encls_bitmap(vcpu, NULL);
7884         if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
7885                 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
7886         else
7887                 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
7888
7889         if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
7890                 vmx->msr_ia32_feature_control_valid_bits |=
7891                         FEAT_CTL_SGX_LC_ENABLED;
7892         else
7893                 vmx->msr_ia32_feature_control_valid_bits &=
7894                         ~FEAT_CTL_SGX_LC_ENABLED;
7895
7896         /* Refresh #PF interception to account for MAXPHYADDR changes. */
7897         vmx_update_exception_bitmap(vcpu);
7898 }
7899
7900 static __init u64 vmx_get_perf_capabilities(void)
7901 {
7902         u64 perf_cap = PMU_CAP_FW_WRITES;
7903         u64 host_perf_cap = 0;
7904
7905         if (!enable_pmu)
7906                 return 0;
7907
7908         if (boot_cpu_has(X86_FEATURE_PDCM))
7909                 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
7910
7911         if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
7912                 x86_perf_get_lbr(&vmx_lbr_caps);
7913
7914                 /*
7915                  * KVM requires LBR callstack support, as the overhead due to
7916                  * context switching LBRs without said support is too high.
7917                  * See intel_pmu_create_guest_lbr_event() for more info.
7918                  */
7919                 if (!vmx_lbr_caps.has_callstack)
7920                         memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
7921                 else if (vmx_lbr_caps.nr)
7922                         perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
7923         }
7924
7925         if (vmx_pebs_supported()) {
7926                 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
7927
7928                 /*
7929                  * Disallow adaptive PEBS as it is functionally broken, can be
7930                  * used by the guest to read *host* LBRs, and can be used to
7931                  * bypass userspace event filters.  To correctly and safely
7932                  * support adaptive PEBS, KVM needs to:
7933                  *
7934                  * 1. Account for the ADAPTIVE flag when (re)programming fixed
7935                  *    counters.
7936                  *
7937                  * 2. Gain support from perf (or take direct control of counter
7938                  *    programming) to support events without adaptive PEBS
7939                  *    enabled for the hardware counter.
7940                  *
7941                  * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
7942                  *    adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
7943                  *
7944                  * 4. Document which PMU events are effectively exposed to the
7945                  *    guest via adaptive PEBS, and make adaptive PEBS mutually
7946                  *    exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
7947                  */
7948                 perf_cap &= ~PERF_CAP_PEBS_BASELINE;
7949         }
7950
7951         return perf_cap;
7952 }
7953
7954 static __init void vmx_set_cpu_caps(void)
7955 {
7956         kvm_set_cpu_caps();
7957
7958         /* CPUID 0x1 */
7959         if (nested)
7960                 kvm_cpu_cap_set(X86_FEATURE_VMX);
7961
7962         /* CPUID 0x7 */
7963         if (kvm_mpx_supported())
7964                 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
7965         if (!cpu_has_vmx_invpcid())
7966                 kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
7967         if (vmx_pt_mode_is_host_guest())
7968                 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
7969         if (vmx_pebs_supported()) {
7970                 kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
7971                 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
7972         }
7973
7974         if (!enable_pmu)
7975                 kvm_cpu_cap_clear(X86_FEATURE_PDCM);
7976         kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
7977
7978         if (!enable_sgx) {
7979                 kvm_cpu_cap_clear(X86_FEATURE_SGX);
7980                 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
7981                 kvm_cpu_cap_clear(X86_FEATURE_SGX1);
7982                 kvm_cpu_cap_clear(X86_FEATURE_SGX2);
7983                 kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
7984         }
7985
7986         if (vmx_umip_emulated())
7987                 kvm_cpu_cap_set(X86_FEATURE_UMIP);
7988
7989         /* CPUID 0xD.1 */
7990         kvm_caps.supported_xss = 0;
7991         if (!cpu_has_vmx_xsaves())
7992                 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7993
7994         /* CPUID 0x80000001 and 0x7 (RDPID) */
7995         if (!cpu_has_vmx_rdtscp()) {
7996                 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
7997                 kvm_cpu_cap_clear(X86_FEATURE_RDPID);
7998         }
7999
8000         if (cpu_has_vmx_waitpkg())
8001                 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
8002 }
8003
8004 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
8005                                   struct x86_instruction_info *info)
8006 {
8007         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8008         unsigned short port;
8009         bool intercept;
8010         int size;
8011
8012         if (info->intercept == x86_intercept_in ||
8013             info->intercept == x86_intercept_ins) {
8014                 port = info->src_val;
8015                 size = info->dst_bytes;
8016         } else {
8017                 port = info->dst_val;
8018                 size = info->src_bytes;
8019         }
8020
8021         /*
8022          * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
8023          * VM-exits depend on the 'unconditional IO exiting' VM-execution
8024          * control.
8025          *
8026          * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
8027          */
8028         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
8029                 intercept = nested_cpu_has(vmcs12,
8030                                            CPU_BASED_UNCOND_IO_EXITING);
8031         else
8032                 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
8033
8034         /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
8035         return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
8036 }
8037
8038 int vmx_check_intercept(struct kvm_vcpu *vcpu,
8039                         struct x86_instruction_info *info,
8040                         enum x86_intercept_stage stage,
8041                         struct x86_exception *exception)
8042 {
8043         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8044
8045         switch (info->intercept) {
8046         /*
8047          * RDPID causes #UD if disabled through secondary execution controls.
8048          * Because it is marked as EmulateOnUD, we need to intercept it here.
8049          * Note, RDPID is hidden behind ENABLE_RDTSCP.
8050          */
8051         case x86_intercept_rdpid:
8052                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
8053                         exception->vector = UD_VECTOR;
8054                         exception->error_code_valid = false;
8055                         return X86EMUL_PROPAGATE_FAULT;
8056                 }
8057                 break;
8058
8059         case x86_intercept_in:
8060         case x86_intercept_ins:
8061         case x86_intercept_out:
8062         case x86_intercept_outs:
8063                 return vmx_check_intercept_io(vcpu, info);
8064
8065         case x86_intercept_lgdt:
8066         case x86_intercept_lidt:
8067         case x86_intercept_lldt:
8068         case x86_intercept_ltr:
8069         case x86_intercept_sgdt:
8070         case x86_intercept_sidt:
8071         case x86_intercept_sldt:
8072         case x86_intercept_str:
8073                 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
8074                         return X86EMUL_CONTINUE;
8075
8076                 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
8077                 break;
8078
8079         case x86_intercept_pause:
8080                 /*
8081                  * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
8082                  * with vanilla NOPs in the emulator.  Apply the interception
8083                  * check only to actual PAUSE instructions.  Don't check
8084                  * PAUSE-loop-exiting, software can't expect a given PAUSE to
8085                  * exit, i.e. KVM is within its rights to allow L2 to execute
8086                  * the PAUSE.
8087                  */
8088                 if ((info->rep_prefix != REPE_PREFIX) ||
8089                     !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
8090                         return X86EMUL_CONTINUE;
8091
8092                 break;
8093
8094         /* TODO: check more intercepts... */
8095         default:
8096                 break;
8097         }
8098
8099         return X86EMUL_UNHANDLEABLE;
8100 }
8101
8102 #ifdef CONFIG_X86_64
8103 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */
8104 static inline int u64_shl_div_u64(u64 a, unsigned int shift,
8105                                   u64 divisor, u64 *result)
8106 {
8107         u64 low = a << shift, high = a >> (64 - shift);
8108
8109         /* To avoid the overflow on divq */
8110         if (high >= divisor)
8111                 return 1;
8112
8113         /* Low hold the result, high hold rem which is discarded */
8114         asm("divq %2\n\t" : "=a" (low), "=d" (high) :
8115             "rm" (divisor), "0" (low), "1" (high));
8116         *result = low;
8117
8118         return 0;
8119 }
8120
8121 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8122                      bool *expired)
8123 {
8124         struct vcpu_vmx *vmx;
8125         u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8126         struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8127
8128         vmx = to_vmx(vcpu);
8129         tscl = rdtsc();
8130         guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
8131         delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8132         lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8133                                                     ktimer->timer_advance_ns);
8134
8135         if (delta_tsc > lapic_timer_advance_cycles)
8136                 delta_tsc -= lapic_timer_advance_cycles;
8137         else
8138                 delta_tsc = 0;
8139
8140         /* Convert to host delta tsc if tsc scaling is enabled */
8141         if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
8142             delta_tsc && u64_shl_div_u64(delta_tsc,
8143                                 kvm_caps.tsc_scaling_ratio_frac_bits,
8144                                 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
8145                 return -ERANGE;
8146
8147         /*
8148          * If the delta tsc can't fit in the 32 bit after the multi shift,
8149          * we can't use the preemption timer.
8150          * It's possible that it fits on later vmentries, but checking
8151          * on every vmentry is costly so we just use an hrtimer.
8152          */
8153         if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8154                 return -ERANGE;
8155
8156         vmx->hv_deadline_tsc = tscl + delta_tsc;
8157         *expired = !delta_tsc;
8158         return 0;
8159 }
8160
8161 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8162 {
8163         to_vmx(vcpu)->hv_deadline_tsc = -1;
8164 }
8165 #endif
8166
8167 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8168 {
8169         struct vcpu_vmx *vmx = to_vmx(vcpu);
8170
8171         if (WARN_ON_ONCE(!enable_pml))
8172                 return;
8173
8174         if (is_guest_mode(vcpu)) {
8175                 vmx->nested.update_vmcs01_cpu_dirty_logging = true;
8176                 return;
8177         }
8178
8179         /*
8180          * Note, nr_memslots_dirty_logging can be changed concurrent with this
8181          * code, but in that case another update request will be made and so
8182          * the guest will never run with a stale PML value.
8183          */
8184         if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
8185                 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8186         else
8187                 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8188 }
8189
8190 void vmx_setup_mce(struct kvm_vcpu *vcpu)
8191 {
8192         if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8193                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8194                         FEAT_CTL_LMCE_ENABLED;
8195         else
8196                 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8197                         ~FEAT_CTL_LMCE_ENABLED;
8198 }
8199
8200 #ifdef CONFIG_KVM_SMM
8201 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
8202 {
8203         /* we need a nested vmexit to enter SMM, postpone if run is pending */
8204         if (to_vmx(vcpu)->nested.nested_run_pending)
8205                 return -EBUSY;
8206         return !is_smm(vcpu);
8207 }
8208
8209 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
8210 {
8211         struct vcpu_vmx *vmx = to_vmx(vcpu);
8212
8213         /*
8214          * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
8215          * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is wrong
8216          * SMI and RSM only modify state that is saved and restored via SMRAM.
8217          * E.g. most MSRs are left untouched, but many are modified by VM-Exit
8218          * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8219          */
8220         vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8221         if (vmx->nested.smm.guest_mode)
8222                 nested_vmx_vmexit(vcpu, -1, 0, 0);
8223
8224         vmx->nested.smm.vmxon = vmx->nested.vmxon;
8225         vmx->nested.vmxon = false;
8226         vmx_clear_hlt(vcpu);
8227         return 0;
8228 }
8229
8230 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
8231 {
8232         struct vcpu_vmx *vmx = to_vmx(vcpu);
8233         int ret;
8234
8235         if (vmx->nested.smm.vmxon) {
8236                 vmx->nested.vmxon = true;
8237                 vmx->nested.smm.vmxon = false;
8238         }
8239
8240         if (vmx->nested.smm.guest_mode) {
8241                 ret = nested_vmx_enter_non_root_mode(vcpu, false);
8242                 if (ret)
8243                         return ret;
8244
8245                 vmx->nested.nested_run_pending = 1;
8246                 vmx->nested.smm.guest_mode = false;
8247         }
8248         return 0;
8249 }
8250
/* Intentionally empty hook, see the comment in the body. */
void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
	/* Nothing to do, RSM will cause a vmexit anyway. */
}
8255 #endif
8256
8257 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8258 {
8259         return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
8260 }
8261
8262 void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8263 {
8264         if (is_guest_mode(vcpu)) {
8265                 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8266
8267                 if (hrtimer_try_to_cancel(timer) == 1)
8268                         hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
8269         }
8270 }
8271
8272 void vmx_hardware_unsetup(void)
8273 {
8274         kvm_set_posted_intr_wakeup_handler(NULL);
8275
8276         if (nested)
8277                 nested_vmx_hardware_unsetup();
8278
8279         free_kvm_area();
8280 }
8281
8282 void vmx_vm_destroy(struct kvm *kvm)
8283 {
8284         struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8285
8286         free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
8287 }
8288
8289 /*
8290  * Note, the SDM states that the linear address is masked *after* the modified
8291  * canonicality check, whereas KVM masks (untags) the address and then performs
8292  * a "normal" canonicality check.  Functionally, the two methods are identical,
8293  * and when the masking occurs relative to the canonicality check isn't visible
8294  * to software, i.e. KVM's behavior doesn't violate the SDM.
8295  */
8296 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
8297 {
8298         int lam_bit;
8299         unsigned long cr3_bits;
8300
8301         if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
8302                 return gva;
8303
8304         if (!is_64_bit_mode(vcpu))
8305                 return gva;
8306
8307         /*
8308          * Bit 63 determines if the address should be treated as user address
8309          * or a supervisor address.
8310          */
8311         if (!(gva & BIT_ULL(63))) {
8312                 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
8313                 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
8314                         return gva;
8315
8316                 /* LAM_U48 is ignored if LAM_U57 is set. */
8317                 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
8318         } else {
8319                 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
8320                         return gva;
8321
8322                 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
8323         }
8324
8325         /*
8326          * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
8327          * Bit 63 is retained from the raw virtual address so that untagging
8328          * doesn't change a user access to a supervisor access, and vice versa.
8329          */
8330         return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
8331 }
8332
8333 static unsigned int vmx_handle_intel_pt_intr(void)
8334 {
8335         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
8336
8337         /* '0' on failure so that the !PT case can use a RET0 static call. */
8338         if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
8339                 return 0;
8340
8341         kvm_make_request(KVM_REQ_PMI, vcpu);
8342         __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8343                   (unsigned long *)&vcpu->arch.pmu.global_status);
8344         return 1;
8345 }
8346
8347 static __init void vmx_setup_user_return_msrs(void)
8348 {
8349
8350         /*
8351          * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
8352          * will emulate SYSCALL in legacy mode if the vendor string in guest
8353          * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
8354          * support this emulation, MSR_STAR is included in the list for i386,
8355          * but is never loaded into hardware.  MSR_CSTAR is also never loaded
8356          * into hardware and is here purely for emulation purposes.
8357          */
8358         const u32 vmx_uret_msrs_list[] = {
8359         #ifdef CONFIG_X86_64
8360                 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
8361         #endif
8362                 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
8363                 MSR_IA32_TSX_CTRL,
8364         };
8365         int i;
8366
8367         BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
8368
8369         for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
8370                 kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
8371 }
8372
8373 static void __init vmx_setup_me_spte_mask(void)
8374 {
8375         u64 me_mask = 0;
8376
8377         /*
8378          * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
8379          * kvm_host.maxphyaddr.  On MKTME and/or TDX capable systems,
8380          * boot_cpu_data.x86_phys_bits holds the actual physical address
8381          * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
8382          * MAXPHYADDR reported by CPUID.  Those bits between are KeyID bits.
8383          */
8384         if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
8385                 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
8386                                     kvm_host.maxphyaddr - 1);
8387
8388         /*
8389          * Unlike SME, host kernel doesn't support setting up any
8390          * MKTME KeyID on Intel platforms.  No memory encryption
8391          * bits should be included into the SPTE.
8392          */
8393         kvm_mmu_set_me_spte_mask(0, me_mask);
8394 }
8395
8396 __init int vmx_hardware_setup(void)
8397 {
8398         unsigned long host_bndcfgs;
8399         struct desc_ptr dt;
8400         int r;
8401
8402         store_idt(&dt);
8403         host_idt_base = dt.address;
8404
8405         vmx_setup_user_return_msrs();
8406
8407         if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8408                 return -EIO;
8409
8410         if (boot_cpu_has(X86_FEATURE_NX))
8411                 kvm_enable_efer_bits(EFER_NX);
8412
8413         if (boot_cpu_has(X86_FEATURE_MPX)) {
8414                 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
8415                 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
8416         }
8417
8418         if (!cpu_has_vmx_mpx())
8419                 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
8420                                              XFEATURE_MASK_BNDCSR);
8421
8422         if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8423             !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8424                 enable_vpid = 0;
8425
8426         if (!cpu_has_vmx_ept() ||
8427             !cpu_has_vmx_ept_4levels() ||
8428             !cpu_has_vmx_ept_mt_wb() ||
8429             !cpu_has_vmx_invept_global())
8430                 enable_ept = 0;
8431
8432         /* NX support is required for shadow paging. */
8433         if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
8434                 pr_err_ratelimited("NX (Execute Disable) not supported\n");
8435                 return -EOPNOTSUPP;
8436         }
8437
8438         if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8439                 enable_ept_ad_bits = 0;
8440
8441         if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8442                 enable_unrestricted_guest = 0;
8443
8444         if (!cpu_has_vmx_flexpriority())
8445                 flexpriority_enabled = 0;
8446
8447         if (!cpu_has_virtual_nmis())
8448                 enable_vnmi = 0;
8449
8450 #ifdef CONFIG_X86_SGX_KVM
8451         if (!cpu_has_vmx_encls_vmexit())
8452                 enable_sgx = false;
8453 #endif
8454
8455         /*
8456          * set_apic_access_page_addr() is used to reload apic access
8457          * page upon invalidation.  No need to do anything if not
8458          * using the APIC_ACCESS_ADDR VMCS field.
8459          */
8460         if (!flexpriority_enabled)
8461                 vt_x86_ops.set_apic_access_page_addr = NULL;
8462
8463         if (!cpu_has_vmx_tpr_shadow())
8464                 vt_x86_ops.update_cr8_intercept = NULL;
8465
8466 #if IS_ENABLED(CONFIG_HYPERV)
8467         if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
8468             && enable_ept) {
8469                 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
8470                 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
8471         }
8472 #endif
8473
8474         if (!cpu_has_vmx_ple()) {
8475                 ple_gap = 0;
8476                 ple_window = 0;
8477                 ple_window_grow = 0;
8478                 ple_window_max = 0;
8479                 ple_window_shrink = 0;
8480         }
8481
8482         if (!cpu_has_vmx_apicv())
8483                 enable_apicv = 0;
8484         if (!enable_apicv)
8485                 vt_x86_ops.sync_pir_to_irr = NULL;
8486
8487         if (!enable_apicv || !cpu_has_vmx_ipiv())
8488                 enable_ipiv = false;
8489
8490         if (cpu_has_vmx_tsc_scaling())
8491                 kvm_caps.has_tsc_control = true;
8492
8493         kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8494         kvm_caps.tsc_scaling_ratio_frac_bits = 48;
8495         kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
8496         kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
8497
8498         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8499
8500         if (enable_ept)
8501                 kvm_mmu_set_ept_masks(enable_ept_ad_bits,
8502                                       cpu_has_vmx_ept_execute_only());
8503
8504         /*
8505          * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
8506          * bits to shadow_zero_check.
8507          */
8508         vmx_setup_me_spte_mask();
8509
8510         kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
8511                           ept_caps_to_lpage_level(vmx_capability.ept));
8512
8513         /*
8514          * Only enable PML when hardware supports PML feature, and both EPT
8515          * and EPT A/D bit features are enabled -- PML depends on them to work.
8516          */
8517         if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8518                 enable_pml = 0;
8519
8520         if (!enable_pml)
8521                 vt_x86_ops.cpu_dirty_log_size = 0;
8522
8523         if (!cpu_has_vmx_preemption_timer())
8524                 enable_preemption_timer = false;
8525
8526         if (enable_preemption_timer) {
8527                 u64 use_timer_freq = 5000ULL * 1000 * 1000;
8528
8529                 cpu_preemption_timer_multi =
8530                         vmx_misc_preemption_timer_rate(vmcs_config.misc);
8531
8532                 if (tsc_khz)
8533                         use_timer_freq = (u64)tsc_khz * 1000;
8534                 use_timer_freq >>= cpu_preemption_timer_multi;
8535
8536                 /*
8537                  * KVM "disables" the preemption timer by setting it to its max
8538                  * value.  Don't use the timer if it might cause spurious exits
8539                  * at a rate faster than 0.1 Hz (of uninterrupted guest time).
8540                  */
8541                 if (use_timer_freq > 0xffffffffu / 10)
8542                         enable_preemption_timer = false;
8543         }
8544
8545         if (!enable_preemption_timer) {
8546                 vt_x86_ops.set_hv_timer = NULL;
8547                 vt_x86_ops.cancel_hv_timer = NULL;
8548         }
8549
8550         kvm_caps.supported_mce_cap |= MCG_LMCE_P;
8551         kvm_caps.supported_mce_cap |= MCG_CMCI_P;
8552
8553         if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8554                 return -EINVAL;
8555         if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
8556                 pt_mode = PT_MODE_SYSTEM;
8557         if (pt_mode == PT_MODE_HOST_GUEST)
8558                 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
8559         else
8560                 vt_init_ops.handle_intel_pt_intr = NULL;
8561
8562         setup_default_sgx_lepubkeyhash();
8563
8564         if (nested) {
8565                 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
8566
8567                 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8568                 if (r)
8569                         return r;
8570         }
8571
8572         vmx_set_cpu_caps();
8573
8574         r = alloc_kvm_area();
8575         if (r && nested)
8576                 nested_vmx_hardware_unsetup();
8577
8578         kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
8579
8580         return r;
8581 }
8582
8583 static void vmx_cleanup_l1d_flush(void)
8584 {
8585         if (vmx_l1d_flush_pages) {
8586                 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8587                 vmx_l1d_flush_pages = NULL;
8588         }
8589         /* Restore state so sysfs ignores VMX */
8590         l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
8591 }
8592
8593 static void __vmx_exit(void)
8594 {
8595         allow_smaller_maxphyaddr = false;
8596
8597         vmx_cleanup_l1d_flush();
8598 }
8599
/*
 * Module exit: tear things down in the reverse order of vmx_init(), i.e.
 * common KVM first, then VMX-local state, then the x86 vendor layer.
 */
static void vmx_exit(void)
{
        kvm_exit();
        __vmx_exit();
        kvm_x86_vendor_exit();
}
module_exit(vmx_exit);
8608
8609 static int __init vmx_init(void)
8610 {
8611         int r, cpu;
8612
8613         if (!kvm_is_vmx_supported())
8614                 return -EOPNOTSUPP;
8615
8616         /*
8617          * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
8618          * to unwind if a later step fails.
8619          */
8620         hv_init_evmcs();
8621
8622         r = kvm_x86_vendor_init(&vt_init_ops);
8623         if (r)
8624                 return r;
8625
8626         /*
8627          * Must be called after common x86 init so enable_ept is properly set
8628          * up. Hand the parameter mitigation value in which was stored in
8629          * the pre module init parser. If no parameter was given, it will
8630          * contain 'auto' which will be turned into the default 'cond'
8631          * mitigation mode.
8632          */
8633         r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
8634         if (r)
8635                 goto err_l1d_flush;
8636
8637         for_each_possible_cpu(cpu) {
8638                 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
8639
8640                 pi_init_cpu(cpu);
8641         }
8642
8643         vmx_check_vmcs12_offsets();
8644
8645         /*
8646          * Shadow paging doesn't have a (further) performance penalty
8647          * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
8648          * by default
8649          */
8650         if (!enable_ept)
8651                 allow_smaller_maxphyaddr = true;
8652
8653         /*
8654          * Common KVM initialization _must_ come last, after this, /dev/kvm is
8655          * exposed to userspace!
8656          */
8657         r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
8658                      THIS_MODULE);
8659         if (r)
8660                 goto err_kvm_init;
8661
8662         return 0;
8663
8664 err_kvm_init:
8665         __vmx_exit();
8666 err_l1d_flush:
8667         kvm_x86_vendor_exit();
8668         return r;
8669 }
8670 module_init(vmx_init);