Git Repo - linux.git/commitdiff
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <[email protected]>
Mon, 16 Feb 2015 22:58:12 +0000 (14:58 -0800)
committer Linus Torvalds <[email protected]>
Mon, 16 Feb 2015 22:58:12 +0000 (14:58 -0800)
Pull x86 perf updates from Ingo Molnar:
 "This series tightens up RDPMC permissions: currently even highly
  sandboxed x86 execution environments (such as seccomp) have permission
  to execute RDPMC, which may leak various perf events / PMU state such
  as timing information and other CPU execution details.

  This 'all is allowed' RDPMC mode is still preserved as the
  (non-default) /sys/devices/cpu/rdpmc=2 setting.  The new default is
  that RDPMC access is only allowed if a perf event is mmap-ed (which is
  needed to correctly interpret RDPMC counter values in any case).

  As a side effect of these changes CR4 handling is cleaned up in the
  x86 code and a shadow copy of the CR4 value is added.

  The extra CR4 manipulation adds less than ~50 ns to the context
  switch cost between rdpmc-capable and rdpmc-non-capable mms"
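
As background for the mmap-gated default described above, the sketch below shows
roughly how a self-monitoring task would satisfy the new requirement: open a
hardware event with perf_event_open(2), mmap its control page, and only then read
the counter with RDPMC using the seqlock protocol exported in struct
perf_event_mmap_page (a zero pc->index or a clear cap_user_rdpmc means RDPMC is
not usable).  This is a minimal illustration of the documented userspace
protocol, not code from the merged series; the choice of
PERF_COUNT_HW_INSTRUCTIONS is arbitrary and most error handling is omitted.

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <sys/mman.h>
    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>
    #include <unistd.h>

    #define barrier() asm volatile("" ::: "memory")

    static uint64_t rdpmc(uint32_t counter)
    {
        uint32_t lo, hi;

        asm volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(counter));
        return lo | ((uint64_t)hi << 32);
    }

    int main(void)
    {
        struct perf_event_attr attr;
        struct perf_event_mmap_page *pc;
        int64_t count;
        uint32_t seq, idx;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_HARDWARE;
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
            perror("perf_event_open");
            return 1;
        }

        /* The mmap is what grants this mm RDPMC access by default now. */
        pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
        if (pc == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        do {
            seq = pc->lock;                   /* lock is a sequence count */
            barrier();
            idx = pc->index;
            count = pc->offset;
            if (pc->cap_user_rdpmc && idx) {
                uint64_t pmc = rdpmc(idx - 1);

                /* sign-extend the raw counter from pmc_width bits */
                pmc <<= 64 - pc->pmc_width;
                count += (int64_t)pmc >> (64 - pc->pmc_width);
            }
            barrier();
        } while (pc->lock != seq);

        printf("instructions: %lld\n", (long long)count);
        return 0;
    }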

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86: Add /sys/devices/cpu/rdpmc=2 to allow rdpmc for all tasks
  perf/x86: Only allow rdpmc if a perf_event is mapped
  perf: Pass the event to arch_perf_update_userpage()
  perf: Add pmu callbacks to track event mapping and unmapping
  x86: Add a comment clarifying LDT context switching
  x86: Store a per-cpu shadow copy of CR4
  x86: Clean up cr4 manipulation
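
The last two entries in that list are where the per-CPU CR4 shadow comes from.
Below is a toy, self-contained model of the pattern, not the kernel's actual
code: hw_cr4, write_hw_cr4() and hw_writes are stand-ins invented for
illustration.  Reads are served from the cached copy, and the expensive
register write is skipped when the requested bits are already in the desired
state, which is what keeps the added context-switch cost small.  In the kernel
the shadow lives in per-CPU data and is populated early via cr4_init_shadow(),
as the cpu_init() and x86_64_start_kernel() hunks in the diff below show.

    #include <stdio.h>

    /* Toy model of the CR4 shadow pattern -- not the kernel's code. */
    static unsigned long hw_cr4;      /* stands in for the real register */
    static unsigned long cr4_shadow;  /* per-CPU cached copy in the kernel */
    static unsigned long hw_writes;   /* count the expensive writes */

    static void write_hw_cr4(unsigned long val)
    {
        hw_cr4 = val;
        hw_writes++;
    }

    static void cr4_init_shadow(void)
    {
        cr4_shadow = hw_cr4;          /* populate before anyone reads it */
    }

    static unsigned long cr4_read_shadow(void)
    {
        return cr4_shadow;            /* never touches the register */
    }

    static void cr4_set_bits(unsigned long mask)
    {
        unsigned long cr4 = cr4_shadow;

        if ((cr4 | mask) != cr4) {    /* skip no-op updates */
            cr4 |= mask;
            cr4_shadow = cr4;
            write_hw_cr4(cr4);
        }
    }

    static void cr4_clear_bits(unsigned long mask)
    {
        unsigned long cr4 = cr4_shadow;

        if ((cr4 & ~mask) != cr4) {
            cr4 &= ~mask;
            cr4_shadow = cr4;
            write_hw_cr4(cr4);
        }
    }

    int main(void)
    {
        cr4_init_shadow();
        cr4_set_bits(1UL << 8);       /* e.g. a PCE-like bit */
        cr4_set_bits(1UL << 8);       /* already set: no register write */
        cr4_clear_bits(1UL << 8);
        printf("shadow=%#lx hw_writes=%lu\n", cr4_read_shadow(), hw_writes);
        return 0;
    }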

arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/head64.c
arch/x86/kernel/i387.c
arch/x86/kernel/setup.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/mm/init.c
include/linux/perf_event.h
kernel/events/core.c

diff --combined arch/x86/kernel/cpu/common.c
index b15bffcaba6d41fdcb1fff7d813aa3fd09f839ac,07f2fc3c13a4d614b76fdad3b99ee6ec8e9c3683..b5c8ff5e9dfcad79075a1af5f41b0ac0ee37231b
@@@ -19,6 -19,7 +19,7 @@@
  #include <asm/archrandom.h>
  #include <asm/hypervisor.h>
  #include <asm/processor.h>
+ #include <asm/tlbflush.h>
  #include <asm/debugreg.h>
  #include <asm/sections.h>
  #include <asm/vsyscall.h>
@@@ -278,7 -279,7 +279,7 @@@ __setup("nosmep", setup_disable_smep)
  static __always_inline void setup_smep(struct cpuinfo_x86 *c)
  {
        if (cpu_has(c, X86_FEATURE_SMEP))
-               set_in_cr4(X86_CR4_SMEP);
+               cr4_set_bits(X86_CR4_SMEP);
  }
  
  static __init int setup_disable_smap(char *arg)
@@@ -298,9 -299,9 +299,9 @@@ static __always_inline void setup_smap(
  
        if (cpu_has(c, X86_FEATURE_SMAP)) {
  #ifdef CONFIG_X86_SMAP
-               set_in_cr4(X86_CR4_SMAP);
+               cr4_set_bits(X86_CR4_SMAP);
  #else
-               clear_in_cr4(X86_CR4_SMAP);
+               cr4_clear_bits(X86_CR4_SMAP);
  #endif
        }
  }
@@@ -491,18 -492,17 +492,18 @@@ u16 __read_mostly tlb_lld_2m[NR_INFO]
  u16 __read_mostly tlb_lld_4m[NR_INFO];
  u16 __read_mostly tlb_lld_1g[NR_INFO];
  
 -void cpu_detect_tlb(struct cpuinfo_x86 *c)
 +static void cpu_detect_tlb(struct cpuinfo_x86 *c)
  {
        if (this_cpu->c_detect_tlb)
                this_cpu->c_detect_tlb(c);
  
 -      printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
 -              "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
 +      pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
                tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
 -              tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
 -              tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
 -              tlb_lld_1g[ENTRIES]);
 +              tlb_lli_4m[ENTRIES]);
 +
 +      pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
 +              tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
 +              tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
  }
  
  void detect_ht(struct cpuinfo_x86 *c)
@@@ -1294,6 -1294,12 +1295,12 @@@ void cpu_init(void
  
        wait_for_master_cpu(cpu);
  
+       /*
+        * Initialize the CR4 shadow before doing anything that could
+        * try to read it.
+        */
+       cr4_init_shadow();
        /*
         * Load microcode on this cpu if a valid microcode is available.
         * This is early microcode loading procedure.
  
        pr_debug("Initializing CPU#%d\n", cpu);
  
-       clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+       cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
  
        /*
         * Initialize the per-CPU GDT with the boot GDT,
        barrier();
  
        x86_configure_nx();
 -      enable_x2apic();
 +      x2apic_setup();
  
        /*
         * set up and load the per-CPU TSS
@@@ -1394,7 -1400,7 +1401,7 @@@ void cpu_init(void
        printk(KERN_INFO "Initializing CPU#%d\n", cpu);
  
        if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
-               clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+               cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
  
        load_current_idt();
        switch_to_new_gdt(cpu);
diff --combined arch/x86/kernel/cpu/mcheck/mce.c
index cdfed7953963fa31251c24c254f55a267951bf02,15ad3ed1a3cd3f4e7f626c2f77c079f6d0de3249..3be9fa69f8758d61872888f0fcd9bc26692ae0ae
@@@ -44,6 -44,7 +44,7 @@@
  
  #include <asm/processor.h>
  #include <asm/traps.h>
+ #include <asm/tlbflush.h>
  #include <asm/mce.h>
  #include <asm/msr.h>
  
@@@ -116,7 -117,7 +117,7 @@@ static void (*quirk_no_way_out)(int ban
   * CPU/chipset specific EDAC code can register a notifier call here to print
   * MCE errors in a human-readable form.
   */
 -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 +static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
  
  /* Do initial initialization of a struct mce */
  void mce_setup(struct mce *m)
@@@ -312,7 -313,7 +313,7 @@@ static void wait_for_panic(void
        panic("Panicing machine check CPU died");
  }
  
 -static void mce_panic(char *msg, struct mce *final, char *exp)
 +static void mce_panic(const char *msg, struct mce *final, char *exp)
  {
        int i, apei_err = 0;
  
@@@ -530,7 -531,7 +531,7 @@@ static void mce_schedule_work(void
                schedule_work(this_cpu_ptr(&mce_work));
  }
  
 -DEFINE_PER_CPU(struct irq_work, mce_irq_work);
 +static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
  
  static void mce_irq_work_cb(struct irq_work *entry)
  {
@@@ -736,7 -737,7 +737,7 @@@ static atomic_t mce_callin
  /*
   * Check if a timeout waiting for other CPUs happened.
   */
 -static int mce_timed_out(u64 *t)
 +static int mce_timed_out(u64 *t, const char *msg)
  {
        /*
         * The others already did panic for some reason.
                goto out;
        if ((s64)*t < SPINUNIT) {
                if (mca_cfg.tolerant <= 1)
 -                      mce_panic("Timeout synchronizing machine check over CPUs",
 -                                NULL, NULL);
 +                      mce_panic(msg, NULL, NULL);
                cpu_missing = 1;
                return 1;
        }
@@@ -867,8 -869,7 +868,8 @@@ static int mce_start(int *no_way_out
         * Wait for everyone.
         */
        while (atomic_read(&mce_callin) != cpus) {
 -              if (mce_timed_out(&timeout)) {
 +              if (mce_timed_out(&timeout,
 +                                "Timeout: Not all CPUs entered broadcast exception handler")) {
                        atomic_set(&global_nwo, 0);
                        return -1;
                }
                 * only seen by one CPU before cleared, avoiding duplicates.
                 */
                while (atomic_read(&mce_executing) < order) {
 -                      if (mce_timed_out(&timeout)) {
 +                      if (mce_timed_out(&timeout,
 +                                        "Timeout: Subject CPUs unable to finish machine check processing")) {
                                atomic_set(&global_nwo, 0);
                                return -1;
                        }
@@@ -938,8 -938,7 +939,8 @@@ static int mce_end(int order
                 * loops.
                 */
                while (atomic_read(&mce_executing) <= cpus) {
 -                      if (mce_timed_out(&timeout))
 +                      if (mce_timed_out(&timeout,
 +                                        "Timeout: Monarch CPU unable to finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }
                 * Subject: Wait for Monarch to finish.
                 */
                while (atomic_read(&mce_executing) != 0) {
 -                      if (mce_timed_out(&timeout))
 +                      if (mce_timed_out(&timeout,
 +                                        "Timeout: Monarch CPU did not finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }
@@@ -1452,7 -1450,7 +1453,7 @@@ static void __mcheck_cpu_init_generic(v
        bitmap_fill(all_banks, MAX_NR_BANKS);
        machine_check_poll(MCP_UC | m_fl, &all_banks);
  
-       set_in_cr4(X86_CR4_MCE);
+       cr4_set_bits(X86_CR4_MCE);
  
        rdmsrl(MSR_IA32_MCG_CAP, cap);
        if (cap & MCG_CTL_P)
diff --combined arch/x86/kernel/head64.c
index efcddfaf05f9f94f41b20d1a5a99c851e9b09a3b,3b241f0ca005fcfc9a157d2bdff04ad69f0d34f8..c4f8d4659070db99ce190543186bc4a4ac5d2ac9
@@@ -27,7 -27,6 +27,7 @@@
  #include <asm/bios_ebda.h>
  #include <asm/bootparam_utils.h>
  #include <asm/microcode.h>
 +#include <asm/kasan.h>
  
  /*
   * Manage page tables very early on.
@@@ -47,7 -46,7 +47,7 @@@ static void __init reset_early_page_tab
  
        next_early_pgt = 0;
  
 -      write_cr3(__pa(early_level4_pgt));
 +      write_cr3(__pa_nodebug(early_level4_pgt));
  }
  
  /* Create a new PMD entry */
@@@ -60,7 -59,7 +60,7 @@@ int __init early_make_pgtable(unsigned 
        pmdval_t pmd, *pmd_p;
  
        /* Invalid address or early pgt is done ?  */
 -      if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
 +      if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
                return -1;
  
  again:
@@@ -156,11 -155,11 +156,13 @@@ asmlinkage __visible void __init x86_64
                                (__START_KERNEL & PGDIR_MASK)));
        BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
  
+       cr4_init_shadow();
        /* Kill off the identity-map trampoline */
        reset_early_page_tables();
  
 +      kasan_map_early_shadow(early_level4_pgt);
 +
        /* clear bss before set_intr_gate with early_idt_handler */
        clear_bss();
  
        /* set init_level4_pgt kernel high mapping*/
        init_level4_pgt[511] = early_level4_pgt[511];
  
 +      kasan_map_early_shadow(init_level4_pgt);
 +
        x86_64_start_reservations(real_mode_data);
  }
  
diff --combined arch/x86/kernel/i387.c
index 81049ffab2d601cf67ce6bdf455edb4d65abbc46,87727b03196da51aa6aaf68de607485d262e5fab..d5651fce0b71af6c15226483b6a06398ccbeb8a0
  #include <asm/sigcontext.h>
  #include <asm/processor.h>
  #include <asm/math_emu.h>
+ #include <asm/tlbflush.h>
  #include <asm/uaccess.h>
  #include <asm/ptrace.h>
  #include <asm/i387.h>
  #include <asm/fpu-internal.h>
  #include <asm/user.h>
  
 +static DEFINE_PER_CPU(bool, in_kernel_fpu);
 +
 +void kernel_fpu_disable(void)
 +{
 +      WARN_ON(this_cpu_read(in_kernel_fpu));
 +      this_cpu_write(in_kernel_fpu, true);
 +}
 +
 +void kernel_fpu_enable(void)
 +{
 +      this_cpu_write(in_kernel_fpu, false);
 +}
 +
  /*
   * Were we in an interrupt that interrupted kernel mode?
   *
@@@ -46,9 -34,6 +47,9 @@@
   */
  static inline bool interrupted_kernel_fpu_idle(void)
  {
 +      if (this_cpu_read(in_kernel_fpu))
 +              return false;
 +
        if (use_eager_fpu())
                return __thread_has_fpu(current);
  
@@@ -89,10 -74,10 +90,10 @@@ void __kernel_fpu_begin(void
  {
        struct task_struct *me = current;
  
 +      this_cpu_write(in_kernel_fpu, true);
 +
        if (__thread_has_fpu(me)) {
 -              __thread_clear_has_fpu(me);
                __save_init_fpu(me);
 -              /* We do 'stts()' in __kernel_fpu_end() */
        } else if (!use_eager_fpu()) {
                this_cpu_write(fpu_owner_task, NULL);
                clts();
@@@ -102,16 -87,19 +103,16 @@@ EXPORT_SYMBOL(__kernel_fpu_begin)
  
  void __kernel_fpu_end(void)
  {
 -      if (use_eager_fpu()) {
 -              /*
 -               * For eager fpu, most the time, tsk_used_math() is true.
 -               * Restore the user math as we are done with the kernel usage.
 -               * At few instances during thread exit, signal handling etc,
 -               * tsk_used_math() is false. Those few places will take proper
 -               * actions, so we don't need to restore the math here.
 -               */
 -              if (likely(tsk_used_math(current)))
 -                      math_state_restore();
 -      } else {
 +      struct task_struct *me = current;
 +
 +      if (__thread_has_fpu(me)) {
 +              if (WARN_ON(restore_fpu_checking(me)))
 +                      drop_init_fpu(me);
 +      } else if (!use_eager_fpu()) {
                stts();
        }
 +
 +      this_cpu_write(in_kernel_fpu, false);
  }
  EXPORT_SYMBOL(__kernel_fpu_end);
  
@@@ -193,7 -181,7 +194,7 @@@ void fpu_init(void
        if (cpu_has_xmm)
                cr4_mask |= X86_CR4_OSXMMEXCPT;
        if (cr4_mask)
-               set_in_cr4(cr4_mask);
+               cr4_set_bits(cr4_mask);
  
        cr0 = read_cr0();
        cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
diff --combined arch/x86/kernel/setup.c
index 27d2009298646990d1b4956307ab192b030aa211,04e6c62f1a9386535f9b1e4f5156e37ec14f402d..0a2421cca01fad095bbb7caa8e7c779d910d751b
@@@ -89,7 -89,6 +89,7 @@@
  #include <asm/cacheflush.h>
  #include <asm/processor.h>
  #include <asm/bugs.h>
 +#include <asm/kasan.h>
  
  #include <asm/vsyscall.h>
  #include <asm/cpu.h>
@@@ -432,13 -431,15 +432,13 @@@ static void __init parse_setup_data(voi
  
        pa_data = boot_params.hdr.setup_data;
        while (pa_data) {
 -              u32 data_len, map_len, data_type;
 +              u32 data_len, data_type;
  
 -              map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
 -                            (u64)sizeof(struct setup_data));
 -              data = early_memremap(pa_data, map_len);
 +              data = early_memremap(pa_data, sizeof(*data));
                data_len = data->len + sizeof(struct setup_data);
                data_type = data->type;
                pa_next = data->next;
 -              early_iounmap(data, map_len);
 +              early_iounmap(data, sizeof(*data));
  
                switch (data_type) {
                case SETUP_E820_EXT:
@@@ -1175,11 -1176,9 +1175,11 @@@ void __init setup_arch(char **cmdline_p
  
        x86_init.paging.pagetable_init();
  
 +      kasan_init();
 +
        if (boot_cpu_data.cpuid_level >= 0) {
                /* A CPU has %cr4 if and only if it has CPUID */
-               mmu_cr4_features = read_cr4();
+               mmu_cr4_features = __read_cr4();
                if (trampoline_cr4_features)
                        *trampoline_cr4_features = mmu_cr4_features;
        }
diff --combined arch/x86/kvm/svm.c
index a17d848c6d42d0d9ba292b06ec9df7dd857fe790,496a54839968e4c8389b67c676f2a3577a3bc785..d319e0c24758876178aeab46c65fe611cb02126e
@@@ -1583,7 -1583,7 +1583,7 @@@ static void svm_set_cr0(struct kvm_vcp
  
  static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  {
-       unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+       unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
  
        if (cr4 & X86_CR4_VMXE)
@@@ -2003,8 -2003,8 +2003,8 @@@ static void nested_svm_inject_npf_exit(
  
  static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
  {
 -      kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
 -
 +      WARN_ON(mmu_is_nested(vcpu));
 +      kvm_init_shadow_mmu(vcpu);
        vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
        vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
diff --combined arch/x86/kvm/vmx.c
index 3f73bfad0349e74042a9315ec20f978a9b5bec85,8dca6ccbb9cefcdb960d8e4ceb62366ef4729ebc..14c1a18d206aeee0d59637162b0f1a58056c8941
@@@ -45,7 -45,6 +45,7 @@@
  #include <asm/perf_event.h>
  #include <asm/debugreg.h>
  #include <asm/kexec.h>
 +#include <asm/apic.h>
  
  #include "trace.h"
  
@@@ -102,9 -101,6 +102,9 @@@ module_param(nested, bool, S_IRUGO)
  
  static u64 __read_mostly host_xss;
  
 +static bool __read_mostly enable_pml = 1;
 +module_param_named(pml, enable_pml, bool, S_IRUGO);
 +
  #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
  #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
  #define KVM_VM_CR0_ALWAYS_ON                                          \
@@@ -219,12 -215,7 +219,12 @@@ struct __packed vmcs12 
        u64 tsc_offset;
        u64 virtual_apic_page_addr;
        u64 apic_access_addr;
 +      u64 posted_intr_desc_addr;
        u64 ept_pointer;
 +      u64 eoi_exit_bitmap0;
 +      u64 eoi_exit_bitmap1;
 +      u64 eoi_exit_bitmap2;
 +      u64 eoi_exit_bitmap3;
        u64 xss_exit_bitmap;
        u64 guest_physical_address;
        u64 vmcs_link_pointer;
        u32 vmx_preemption_timer_value;
        u32 padding32[7]; /* room for future expansion */
        u16 virtual_processor_id;
 +      u16 posted_intr_nv;
        u16 guest_es_selector;
        u16 guest_cs_selector;
        u16 guest_ss_selector;
        u16 guest_gs_selector;
        u16 guest_ldtr_selector;
        u16 guest_tr_selector;
 +      u16 guest_intr_status;
        u16 host_es_selector;
        u16 host_cs_selector;
        u16 host_ss_selector;
@@@ -412,10 -401,6 +412,10 @@@ struct nested_vmx 
         */
        struct page *apic_access_page;
        struct page *virtual_apic_page;
 +      struct page *pi_desc_page;
 +      struct pi_desc *pi_desc;
 +      bool pi_pending;
 +      u16 posted_intr_nv;
        u64 msr_ia32_feature_control;
  
        struct hrtimer preemption_timer;
  
        /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
        u64 vmcs01_debugctl;
 +
 +      u32 nested_vmx_procbased_ctls_low;
 +      u32 nested_vmx_procbased_ctls_high;
 +      u32 nested_vmx_true_procbased_ctls_low;
 +      u32 nested_vmx_secondary_ctls_low;
 +      u32 nested_vmx_secondary_ctls_high;
 +      u32 nested_vmx_pinbased_ctls_low;
 +      u32 nested_vmx_pinbased_ctls_high;
 +      u32 nested_vmx_exit_ctls_low;
 +      u32 nested_vmx_exit_ctls_high;
 +      u32 nested_vmx_true_exit_ctls_low;
 +      u32 nested_vmx_entry_ctls_low;
 +      u32 nested_vmx_entry_ctls_high;
 +      u32 nested_vmx_true_entry_ctls_low;
 +      u32 nested_vmx_misc_low;
 +      u32 nested_vmx_misc_high;
 +      u32 nested_vmx_ept_caps;
  };
  
  #define POSTED_INTR_ON  0
@@@ -543,10 -511,6 +543,10 @@@ struct vcpu_vmx 
        /* Dynamic PLE window. */
        int ple_window;
        bool ple_window_dirty;
 +
 +      /* Support for PML */
 +#define PML_ENTITY_NUM                512
 +      struct page *pml_pg;
  };
  
  enum segment_cache_field {
@@@ -630,7 -594,6 +630,7 @@@ static int max_shadow_read_write_field
  
  static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
 +      FIELD(POSTED_INTR_NV, posted_intr_nv),
        FIELD(GUEST_ES_SELECTOR, guest_es_selector),
        FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
        FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
        FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
        FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
        FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
 +      FIELD(GUEST_INTR_STATUS, guest_intr_status),
        FIELD(HOST_ES_SELECTOR, host_es_selector),
        FIELD(HOST_CS_SELECTOR, host_cs_selector),
        FIELD(HOST_SS_SELECTOR, host_ss_selector),
        FIELD64(TSC_OFFSET, tsc_offset),
        FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
        FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
 +      FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
        FIELD64(EPT_POINTER, ept_pointer),
 +      FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
 +      FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
 +      FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
 +      FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
        FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@@ -809,7 -766,6 +809,7 @@@ static void kvm_cpu_vmxon(u64 addr)
  static void kvm_cpu_vmxoff(void);
  static bool vmx_mpx_supported(void);
  static bool vmx_xsaves_supported(void);
 +static int vmx_vm_has_apicv(struct kvm *kvm);
  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
  static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
@@@ -837,7 -793,6 +837,7 @@@ static unsigned long *vmx_msr_bitmap_le
  static unsigned long *vmx_msr_bitmap_longmode;
  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
 +static unsigned long *vmx_msr_bitmap_nested;
  static unsigned long *vmx_vmread_bitmap;
  static unsigned long *vmx_vmwrite_bitmap;
  
@@@ -1004,6 -959,16 +1004,6 @@@ static inline bool cpu_has_vmx_ept_exec
        return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
  }
  
 -static inline bool cpu_has_vmx_eptp_uncacheable(void)
 -{
 -      return vmx_capability.ept & VMX_EPTP_UC_BIT;
 -}
 -
 -static inline bool cpu_has_vmx_eptp_writeback(void)
 -{
 -      return vmx_capability.ept & VMX_EPTP_WB_BIT;
 -}
 -
  static inline bool cpu_has_vmx_ept_2m_page(void)
  {
        return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
@@@ -1108,11 -1073,6 +1108,11 @@@ static inline bool cpu_has_vmx_shadow_v
                SECONDARY_EXEC_SHADOW_VMCS;
  }
  
 +static inline bool cpu_has_vmx_pml(void)
 +{
 +      return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
 +}
 +
  static inline bool report_flexpriority(void)
  {
        return flexpriority_enabled;
@@@ -1152,26 -1112,6 +1152,26 @@@ static inline bool nested_cpu_has_xsave
                vmx_xsaves_supported();
  }
  
 +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
 +{
 +      return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 +}
 +
 +static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
 +{
 +      return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
 +}
 +
 +static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
 +{
 +      return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 +}
 +
 +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
 +{
 +      return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
 +}
 +
  static inline bool is_exception(u32 intr_info)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@@ -2344,8 -2284,20 +2344,8 @@@ static inline bool nested_vmx_allowed(s
   * if the corresponding bit in the (32-bit) control field *must* be on, and a
   * bit in the high half is on if the corresponding bit in the control field
   * may be on. See also vmx_control_verify().
 - * TODO: allow these variables to be modified (downgraded) by module options
 - * or other means.
   */
 -static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
 -static u32 nested_vmx_true_procbased_ctls_low;
 -static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
 -static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 -static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 -static u32 nested_vmx_true_exit_ctls_low;
 -static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 -static u32 nested_vmx_true_entry_ctls_low;
 -static u32 nested_vmx_misc_low, nested_vmx_misc_high;
 -static u32 nested_vmx_ept_caps;
 -static __init void nested_vmx_setup_ctls_msrs(void)
 +static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
  {
        /*
         * Note that as a general rule, the high half of the MSRs (bits in
  
        /* pin-based controls */
        rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
 -            nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
 -      nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 -      nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
 -              PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
 -      nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 +              vmx->nested.nested_vmx_pinbased_ctls_low,
 +              vmx->nested.nested_vmx_pinbased_ctls_high);
 +      vmx->nested.nested_vmx_pinbased_ctls_low |=
 +              PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 +      vmx->nested.nested_vmx_pinbased_ctls_high &=
 +              PIN_BASED_EXT_INTR_MASK |
 +              PIN_BASED_NMI_EXITING |
 +              PIN_BASED_VIRTUAL_NMIS;
 +      vmx->nested.nested_vmx_pinbased_ctls_high |=
 +              PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
                PIN_BASED_VMX_PREEMPTION_TIMER;
 +      if (vmx_vm_has_apicv(vmx->vcpu.kvm))
 +              vmx->nested.nested_vmx_pinbased_ctls_high |=
 +                      PIN_BASED_POSTED_INTR;
  
        /* exit controls */
        rdmsr(MSR_IA32_VMX_EXIT_CTLS,
 -              nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
 -      nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 +              vmx->nested.nested_vmx_exit_ctls_low,
 +              vmx->nested.nested_vmx_exit_ctls_high);
 +      vmx->nested.nested_vmx_exit_ctls_low =
 +              VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  
 -      nested_vmx_exit_ctls_high &=
 +      vmx->nested.nested_vmx_exit_ctls_high &=
  #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
  #endif
                VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
 -      nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 +      vmx->nested.nested_vmx_exit_ctls_high |=
 +              VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
  
        if (vmx_mpx_supported())
 -              nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 +              vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
  
        /* We support free control of debug control saving. */
 -      nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
 +      vmx->nested.nested_vmx_true_exit_ctls_low =
 +              vmx->nested.nested_vmx_exit_ctls_low &
                ~VM_EXIT_SAVE_DEBUG_CONTROLS;
  
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
 -              nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
 -      nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 -      nested_vmx_entry_ctls_high &=
 +              vmx->nested.nested_vmx_entry_ctls_low,
 +              vmx->nested.nested_vmx_entry_ctls_high);
 +      vmx->nested.nested_vmx_entry_ctls_low =
 +              VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 +      vmx->nested.nested_vmx_entry_ctls_high &=
  #ifdef CONFIG_X86_64
                VM_ENTRY_IA32E_MODE |
  #endif
                VM_ENTRY_LOAD_IA32_PAT;
 -      nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
 -                                     VM_ENTRY_LOAD_IA32_EFER);
 +      vmx->nested.nested_vmx_entry_ctls_high |=
 +              (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
        if (vmx_mpx_supported())
 -              nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 +              vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
  
        /* We support free control of debug control loading. */
 -      nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
 +      vmx->nested.nested_vmx_true_entry_ctls_low =
 +              vmx->nested.nested_vmx_entry_ctls_low &
                ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
  
        /* cpu-based controls */
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
 -              nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
 -      nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 -      nested_vmx_procbased_ctls_high &=
 +              vmx->nested.nested_vmx_procbased_ctls_low,
 +              vmx->nested.nested_vmx_procbased_ctls_high);
 +      vmx->nested.nested_vmx_procbased_ctls_low =
 +              CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 +      vmx->nested.nested_vmx_procbased_ctls_high &=
                CPU_BASED_VIRTUAL_INTR_PENDING |
                CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
                CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
         * can use it to avoid exits to L1 - even when L0 runs L2
         * without MSR bitmaps.
         */
 -      nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 +      vmx->nested.nested_vmx_procbased_ctls_high |=
 +              CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
                CPU_BASED_USE_MSR_BITMAPS;
  
        /* We support free control of CR3 access interception. */
 -      nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
 +      vmx->nested.nested_vmx_true_procbased_ctls_low =
 +              vmx->nested.nested_vmx_procbased_ctls_low &
                ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
  
        /* secondary cpu-based controls */
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
 -              nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
 -      nested_vmx_secondary_ctls_low = 0;
 -      nested_vmx_secondary_ctls_high &=
 +              vmx->nested.nested_vmx_secondary_ctls_low,
 +              vmx->nested.nested_vmx_secondary_ctls_high);
 +      vmx->nested.nested_vmx_secondary_ctls_low = 0;
 +      vmx->nested.nested_vmx_secondary_ctls_high &=
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 +              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 +              SECONDARY_EXEC_APIC_REGISTER_VIRT |
 +              SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_WBINVD_EXITING |
                SECONDARY_EXEC_XSAVES;
  
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
 -              nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
 +              vmx->nested.nested_vmx_secondary_ctls_high |=
 +                      SECONDARY_EXEC_ENABLE_EPT |
                        SECONDARY_EXEC_UNRESTRICTED_GUEST;
 -              nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
 +              vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
                         VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
                         VMX_EPT_INVEPT_BIT;
 -              nested_vmx_ept_caps &= vmx_capability.ept;
 +              vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
                /*
                 * For nested guests, we don't do anything specific
                 * for single context invalidation. Hence, only advertise
                 * support for global context invalidation.
                 */
 -              nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
 +              vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
        } else
 -              nested_vmx_ept_caps = 0;
 +              vmx->nested.nested_vmx_ept_caps = 0;
  
        /* miscellaneous data */
 -      rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
 -      nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
 -      nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
 +      rdmsr(MSR_IA32_VMX_MISC,
 +              vmx->nested.nested_vmx_misc_low,
 +              vmx->nested.nested_vmx_misc_high);
 +      vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
 +      vmx->nested.nested_vmx_misc_low |=
 +              VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
                VMX_MISC_ACTIVITY_HLT;
 -      nested_vmx_misc_high = 0;
 +      vmx->nested.nested_vmx_misc_high = 0;
  }
  
  static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@@ -2518,8 -2443,6 +2518,8 @@@ static inline u64 vmx_control_msr(u32 l
  /* Returns 0 on success, non-0 otherwise. */
  static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
  {
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +
        switch (msr_index) {
        case MSR_IA32_VMX_BASIC:
                /*
                break;
        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
        case MSR_IA32_VMX_PINBASED_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
 -                                      nested_vmx_pinbased_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_pinbased_ctls_low,
 +                      vmx->nested.nested_vmx_pinbased_ctls_high);
                break;
        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
 -                                      nested_vmx_procbased_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_true_procbased_ctls_low,
 +                      vmx->nested.nested_vmx_procbased_ctls_high);
                break;
        case MSR_IA32_VMX_PROCBASED_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
 -                                      nested_vmx_procbased_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_procbased_ctls_low,
 +                      vmx->nested.nested_vmx_procbased_ctls_high);
                break;
        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
 -                                      nested_vmx_exit_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_true_exit_ctls_low,
 +                      vmx->nested.nested_vmx_exit_ctls_high);
                break;
        case MSR_IA32_VMX_EXIT_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
 -                                      nested_vmx_exit_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_exit_ctls_low,
 +                      vmx->nested.nested_vmx_exit_ctls_high);
                break;
        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
 -                                      nested_vmx_entry_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_true_entry_ctls_low,
 +                      vmx->nested.nested_vmx_entry_ctls_high);
                break;
        case MSR_IA32_VMX_ENTRY_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
 -                                      nested_vmx_entry_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_entry_ctls_low,
 +                      vmx->nested.nested_vmx_entry_ctls_high);
                break;
        case MSR_IA32_VMX_MISC:
 -              *pdata = vmx_control_msr(nested_vmx_misc_low,
 -                                       nested_vmx_misc_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_misc_low,
 +                      vmx->nested.nested_vmx_misc_high);
                break;
        /*
         * These MSRs specify bits which the guest must keep fixed (on or off)
                *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
                break;
        case MSR_IA32_VMX_PROCBASED_CTLS2:
 -              *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
 -                                      nested_vmx_secondary_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_secondary_ctls_low,
 +                      vmx->nested.nested_vmx_secondary_ctls_high);
                break;
        case MSR_IA32_VMX_EPT_VPID_CAP:
                /* Currently, no nested vpid support */
 -              *pdata = nested_vmx_ept_caps;
 +              *pdata = vmx->nested.nested_vmx_ept_caps;
                break;
        default:
                return 1;
@@@ -2871,7 -2785,7 +2871,7 @@@ static int hardware_enable(void
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
        u64 old, test_bits;
  
-       if (read_cr4() & X86_CR4_VMXE)
+       if (cr4_read_shadow() & X86_CR4_VMXE)
                return -EBUSY;
  
        INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
                /* enable and lock */
                wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
        }
-       write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+       cr4_set_bits(X86_CR4_VMXE);
  
        if (vmm_exclusive) {
                kvm_cpu_vmxon(phys_addr);
@@@ -2935,7 -2849,7 +2935,7 @@@ static void hardware_disable(void
                vmclear_local_loaded_vmcss();
                kvm_cpu_vmxoff();
        }
-       write_cr4(read_cr4() & ~X86_CR4_VMXE);
+       cr4_clear_bits(X86_CR4_VMXE);
  }
  
  static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@@ -3015,8 -2929,7 +3015,8 @@@ static __init int setup_vmcs_config(str
                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                        SECONDARY_EXEC_SHADOW_VMCS |
 -                      SECONDARY_EXEC_XSAVES;
 +                      SECONDARY_EXEC_XSAVES |
 +                      SECONDARY_EXEC_ENABLE_PML;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
@@@ -4246,52 -4159,6 +4246,52 @@@ static void __vmx_enable_intercept_for_
        }
  }
  
 +/*
 + * If a msr is allowed by L0, we should check whether it is allowed by L1.
 + * The corresponding bit will be cleared unless both of L0 and L1 allow it.
 + */
 +static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
 +                                             unsigned long *msr_bitmap_nested,
 +                                             u32 msr, int type)
 +{
 +      int f = sizeof(unsigned long);
 +
 +      if (!cpu_has_vmx_msr_bitmap()) {
 +              WARN_ON(1);
 +              return;
 +      }
 +
 +      /*
 +       * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
 +       * have the write-low and read-high bitmap offsets the wrong way round.
 +       * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 +       */
 +      if (msr <= 0x1fff) {
 +              if (type & MSR_TYPE_R &&
 +                 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
 +                      /* read-low */
 +                      __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
 +
 +              if (type & MSR_TYPE_W &&
 +                 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
 +                      /* write-low */
 +                      __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
 +
 +      } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 +              msr &= 0x1fff;
 +              if (type & MSR_TYPE_R &&
 +                 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
 +                      /* read-high */
 +                      __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
 +
 +              if (type & MSR_TYPE_W &&
 +                 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
 +                      /* write-high */
 +                      __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
 +
 +      }
 +}
 +
  static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
  {
        if (!longmode_only)
@@@ -4330,64 -4197,6 +4330,64 @@@ static int vmx_vm_has_apicv(struct kvm 
        return enable_apicv && irqchip_in_kernel(kvm);
  }
  
 +static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 +{
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +      int max_irr;
 +      void *vapic_page;
 +      u16 status;
 +
 +      if (vmx->nested.pi_desc &&
 +          vmx->nested.pi_pending) {
 +              vmx->nested.pi_pending = false;
 +              if (!pi_test_and_clear_on(vmx->nested.pi_desc))
 +                      return 0;
 +
 +              max_irr = find_last_bit(
 +                      (unsigned long *)vmx->nested.pi_desc->pir, 256);
 +
 +              if (max_irr == 256)
 +                      return 0;
 +
 +              vapic_page = kmap(vmx->nested.virtual_apic_page);
 +              if (!vapic_page) {
 +                      WARN_ON(1);
 +                      return -ENOMEM;
 +              }
 +              __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
 +              kunmap(vmx->nested.virtual_apic_page);
 +
 +              status = vmcs_read16(GUEST_INTR_STATUS);
 +              if ((u8)max_irr > ((u8)status & 0xff)) {
 +                      status &= ~0xff;
 +                      status |= (u8)max_irr;
 +                      vmcs_write16(GUEST_INTR_STATUS, status);
 +              }
 +      }
 +      return 0;
 +}
 +
 +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 +                                              int vector)
 +{
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +
 +      if (is_guest_mode(vcpu) &&
 +          vector == vmx->nested.posted_intr_nv) {
 +              /* the PIR and ON have been set by L1. */
 +              if (vcpu->mode == IN_GUEST_MODE)
 +                      apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
 +                              POSTED_INTR_VECTOR);
 +              /*
 +               * If a posted intr is not recognized by hardware,
 +               * we will accomplish it in the next vmentry.
 +               */
 +              vmx->nested.pi_pending = true;
 +              kvm_make_request(KVM_REQ_EVENT, vcpu);
 +              return 0;
 +      }
 +      return -1;
 +}
  /*
   * Send interrupt to vcpu via posted interrupt way.
   * 1. If target vcpu is running(non-root mode), send posted interrupt
@@@ -4400,10 -4209,6 +4400,10 @@@ static void vmx_deliver_posted_interrup
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int r;
  
 +      r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
 +      if (!r)
 +              return;
 +
        if (pi_test_and_set_pir(vector, &vmx->pi_desc))
                return;
  
@@@ -4450,7 -4255,7 +4450,7 @@@ static void vmx_set_constant_host_state
        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
  
        /* Save the most likely value for this task's CR4 in the VMCS. */
-       cr4 = read_cr4();
+       cr4 = cr4_read_shadow();
        vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
        vmx->host_state.vmcs_host_cr4 = cr4;
  
@@@ -4555,9 -4360,6 +4555,9 @@@ static u32 vmx_secondary_exec_control(s
           a current VMCS12
        */
        exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
 +      /* PML is enabled/disabled in creating/destorying vcpu */
 +      exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 +
        return exec_control;
  }
  
@@@ -5184,12 -4986,11 +5184,12 @@@ vmx_patch_hypercall(struct kvm_vcpu *vc
        hypercall[2] = 0xc1;
  }
  
 -static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
 +static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
  {
        unsigned long always_on = VMXON_CR0_ALWAYSON;
 +      struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  
 -      if (nested_vmx_secondary_ctls_high &
 +      if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
                SECONDARY_EXEC_UNRESTRICTED_GUEST &&
            nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
                always_on &= ~(X86_CR0_PE | X86_CR0_PG);
@@@ -5214,7 -5015,7 +5214,7 @@@ static int handle_set_cr0(struct kvm_vc
                val = (val & ~vmcs12->cr0_guest_host_mask) |
                        (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
  
 -              if (!nested_cr0_valid(vmcs12, val))
 +              if (!nested_cr0_valid(vcpu, val))
                        return 1;
  
                if (kvm_set_cr0(vcpu, val))
@@@ -6016,21 -5817,13 +6016,21 @@@ static __init int hardware_setup(void
                                (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_longmode_x2apic)
                goto out4;
 +
 +      if (nested) {
 +              vmx_msr_bitmap_nested =
 +                      (unsigned long *)__get_free_page(GFP_KERNEL);
 +              if (!vmx_msr_bitmap_nested)
 +                      goto out5;
 +      }
 +
        vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmread_bitmap)
 -              goto out5;
 +              goto out6;
  
        vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmwrite_bitmap)
 -              goto out6;
 +              goto out7;
  
        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
  
        memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
        memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
 +      if (nested)
 +              memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
  
        if (setup_vmcs_config(&vmcs_config) < 0) {
                r = -EIO;
 -              goto out7;
 +              goto out8;
        }
  
        if (boot_cpu_has(X86_FEATURE_NX))
        if (!cpu_has_vmx_unrestricted_guest())
                enable_unrestricted_guest = 0;
  
 -      if (!cpu_has_vmx_flexpriority()) {
 +      if (!cpu_has_vmx_flexpriority())
                flexpriority_enabled = 0;
  
 -              /*
 -               * set_apic_access_page_addr() is used to reload apic access
 -               * page upon invalidation.  No need to do anything if the
 -               * processor does not have the APIC_ACCESS_ADDR VMCS field.
 -               */
 +      /*
 +       * set_apic_access_page_addr() is used to reload apic access
 +       * page upon invalidation.  No need to do anything if not
 +       * using the APIC_ACCESS_ADDR VMCS field.
 +       */
 +      if (!flexpriority_enabled)
                kvm_x86_ops->set_apic_access_page_addr = NULL;
 -      }
  
        if (!cpu_has_vmx_tpr_shadow())
                kvm_x86_ops->update_cr8_intercept = NULL;
                kvm_x86_ops->update_cr8_intercept = NULL;
        else {
                kvm_x86_ops->hwapic_irr_update = NULL;
 +              kvm_x86_ops->hwapic_isr_update = NULL;
                kvm_x86_ops->deliver_posted_interrupt = NULL;
                kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
        }
  
 -      if (nested)
 -              nested_vmx_setup_ctls_msrs();
 -
        vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
        vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
        vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
  
        update_ple_window_actual_max();
  
 +      /*
 +       * Only enable PML when hardware supports PML feature, and both EPT
 +       * and EPT A/D bit features are enabled -- PML depends on them to work.
 +       */
 +      if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
 +              enable_pml = 0;
 +
 +      if (!enable_pml) {
 +              kvm_x86_ops->slot_enable_log_dirty = NULL;
 +              kvm_x86_ops->slot_disable_log_dirty = NULL;
 +              kvm_x86_ops->flush_log_dirty = NULL;
 +              kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
 +      }
 +
        return alloc_kvm_area();
  
 -out7:
 +out8:
        free_page((unsigned long)vmx_vmwrite_bitmap);
 -out6:
 +out7:
        free_page((unsigned long)vmx_vmread_bitmap);
 +out6:
 +      if (nested)
 +              free_page((unsigned long)vmx_msr_bitmap_nested);
  out5:
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
  out4:
@@@ -6201,8 -5977,6 +6201,8 @@@ static __exit void hardware_unsetup(voi
        free_page((unsigned long)vmx_io_bitmap_a);
        free_page((unsigned long)vmx_vmwrite_bitmap);
        free_page((unsigned long)vmx_vmread_bitmap);
 +      if (nested)
 +              free_page((unsigned long)vmx_msr_bitmap_nested);
  
        free_kvm_area();
  }
@@@ -6369,13 -6143,6 +6369,13 @@@ static void nested_vmx_failValid(struc
         */
  }
  
 +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 +{
 +      /* TODO: not to reset guest simply here. */
 +      kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 +      pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
 +}
 +
  static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
  {
        struct vcpu_vmx *vmx =
@@@ -6665,7 -6432,6 +6665,7 @@@ static inline void nested_release_vmcs1
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
                vmcs_write64(VMCS_LINK_POINTER, -1ull);
        }
 +      vmx->nested.posted_intr_nv = -1;
        kunmap(vmx->nested.current_vmcs12_page);
        nested_release_page(vmx->nested.current_vmcs12_page);
        vmx->nested.current_vmptr = -1ull;
@@@ -6694,12 -6460,6 +6694,12 @@@ static void free_nested(struct vcpu_vm
                nested_release_page(vmx->nested.virtual_apic_page);
                vmx->nested.virtual_apic_page = NULL;
        }
 +      if (vmx->nested.pi_desc_page) {
 +              kunmap(vmx->nested.pi_desc_page);
 +              nested_release_page(vmx->nested.pi_desc_page);
 +              vmx->nested.pi_desc_page = NULL;
 +              vmx->nested.pi_desc = NULL;
 +      }
  
        nested_free_all_saved_vmcss(vmx);
  }
@@@ -7133,7 -6893,6 +7133,7 @@@ static int handle_vmptrst(struct kvm_vc
  /* Emulate the INVEPT instruction */
  static int handle_invept(struct kvm_vcpu *vcpu)
  {
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vmx_instruction_info, types;
        unsigned long type;
        gva_t gva;
                u64 eptp, gpa;
        } operand;
  
 -      if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
 -          !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
 +      if (!(vmx->nested.nested_vmx_secondary_ctls_high &
 +            SECONDARY_EXEC_ENABLE_EPT) ||
 +          !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
        type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
  
 -      types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
 +      types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
  
        if (!(types & (1UL << type))) {
                nested_vmx_failValid(vcpu,
@@@ -7202,31 -6960,6 +7202,31 @@@ static int handle_invvpid(struct kvm_vc
        return 1;
  }
  
 +static int handle_pml_full(struct kvm_vcpu *vcpu)
 +{
 +      unsigned long exit_qualification;
 +
 +      trace_kvm_pml_full(vcpu->vcpu_id);
 +
 +      exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 +
 +      /*
 +       * PML buffer FULL happened while executing iret from NMI,
 +       * "blocked by NMI" bit has to be set before next VM entry.
 +       */
 +      if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
 +                      cpu_has_virtual_nmis() &&
 +                      (exit_qualification & INTR_INFO_UNBLOCK_NMI))
 +              vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 +                              GUEST_INTR_STATE_NMI);
 +
 +      /*
 +       * PML buffer already flushed at beginning of VMEXIT. Nothing to do
 +       * here.., and there's no userspace involvement needed for PML.
 +       */
 +      return 1;
 +}
 +
  /*
   * The exit handlers return 1 if the exit was handled fully and guest execution
   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@@ -7275,7 -7008,6 +7275,7 @@@ static int (*const kvm_vmx_exit_handler
        [EXIT_REASON_INVVPID]                 = handle_invvpid,
        [EXIT_REASON_XSAVES]                  = handle_xsaves,
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
 +      [EXIT_REASON_PML_FULL]                = handle_pml_full,
  };
  
  static const int kvm_vmx_max_exit_handlers =
@@@ -7543,10 -7275,6 +7543,10 @@@ static bool nested_vmx_exit_handled(str
        case EXIT_REASON_APIC_ACCESS:
                return nested_cpu_has2(vmcs12,
                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
 +      case EXIT_REASON_APIC_WRITE:
 +      case EXIT_REASON_EOI_INDUCED:
 +              /* apic_write and eoi_induced should exit unconditionally. */
 +              return 1;
        case EXIT_REASON_EPT_VIOLATION:
                /*
                 * L0 always deals with the EPT violation. If nested EPT is
@@@ -7586,89 -7314,6 +7586,89 @@@ static void vmx_get_exit_info(struct kv
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
  }
  
 +static int vmx_enable_pml(struct vcpu_vmx *vmx)
 +{
 +      struct page *pml_pg;
 +      u32 exec_control;
 +
 +      pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
 +      if (!pml_pg)
 +              return -ENOMEM;
 +
 +      vmx->pml_pg = pml_pg;
 +
 +      vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 +      vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 +
 +      exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 +      exec_control |= SECONDARY_EXEC_ENABLE_PML;
 +      vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 +
 +      return 0;
 +}
 +
 +static void vmx_disable_pml(struct vcpu_vmx *vmx)
 +{
 +      u32 exec_control;
 +
 +      ASSERT(vmx->pml_pg);
 +      __free_page(vmx->pml_pg);
 +      vmx->pml_pg = NULL;
 +
 +      exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 +      exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 +      vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 +}
 +
 +static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
 +{
 +      struct kvm *kvm = vmx->vcpu.kvm;
 +      u64 *pml_buf;
 +      u16 pml_idx;
 +
 +      pml_idx = vmcs_read16(GUEST_PML_INDEX);
 +
 +      /* Do nothing if PML buffer is empty */
 +      if (pml_idx == (PML_ENTITY_NUM - 1))
 +              return;
 +
 +      /* PML index always points to next available PML buffer entity */
 +      if (pml_idx >= PML_ENTITY_NUM)
 +              pml_idx = 0;
 +      else
 +              pml_idx++;
 +
 +      pml_buf = page_address(vmx->pml_pg);
 +      for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
 +              u64 gpa;
 +
 +              gpa = pml_buf[pml_idx];
 +              WARN_ON(gpa & (PAGE_SIZE - 1));
 +              mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
 +      }
 +
 +      /* reset PML index */
 +      vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 +}
 +
 +/*
 + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
 + * Called before reporting dirty_bitmap to userspace.
 + */
 +static void kvm_flush_pml_buffers(struct kvm *kvm)
 +{
 +      int i;
 +      struct kvm_vcpu *vcpu;
 +      /*
 +       * We only need to kick vcpu out of guest mode here, as PML buffer
 +       * is flushed at beginning of all VMEXITs, and it's obvious that only
 +       * vcpus running in guest are possible to have unflushed GPAs in PML
 +       * buffer.
 +       */
 +      kvm_for_each_vcpu(i, vcpu, kvm)
 +              kvm_vcpu_kick(vcpu);
 +}
 +
  /*
   * The guest has exited.  See if we can fix it or if we need userspace
   * assistance.
@@@ -7679,16 -7324,6 +7679,16 @@@ static int vmx_handle_exit(struct kvm_v
        u32 exit_reason = vmx->exit_reason;
        u32 vectoring_info = vmx->idt_vectoring_info;
  
 +      /*
 +       * Flush logged GPAs PML buffer, this will make dirty_bitmap more
 +       * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
 +       * querying dirty_bitmap, we only need to kick all vcpus out of guest
 +       * mode as if vcpus is in root mode, the PML buffer must has been
 +       * flushed already.
 +       */
 +      if (enable_pml)
 +              vmx_flush_pml_buffer(vmx);
 +
        /* If guest state is invalid, start emulating */
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
@@@ -7836,6 -7471,9 +7836,6 @@@ static void vmx_hwapic_isr_update(struc
        u16 status;
        u8 old;
  
 -      if (!vmx_vm_has_apicv(kvm))
 -              return;
 -
        if (isr == -1)
                isr = 0;
  
@@@ -8146,7 -7784,7 +8146,7 @@@ static void __noclone vmx_vcpu_run(stru
        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
  
-       cr4 = read_cr4();
+       cr4 = cr4_read_shadow();
        if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
                vmcs_writel(HOST_CR4, cr4);
                vmx->host_state.vmcs_host_cr4 = cr4;
@@@ -8335,8 -7973,6 +8335,8 @@@ static void vmx_free_vcpu(struct kvm_vc
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
 +      if (enable_pml)
 +              vmx_disable_pml(vmx);
        free_vpid(vmx);
        leave_guest_mode(vcpu);
        vmx_load_vmcs01(vcpu);
@@@ -8404,25 -8040,9 +8404,25 @@@ static struct kvm_vcpu *vmx_create_vcpu
                        goto free_vmcs;
        }
  
 +      if (nested)
 +              nested_vmx_setup_ctls_msrs(vmx);
 +
 +      vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
        vmx->nested.current_vmcs12 = NULL;
  
 +      /*
 +       * If PML is turned on, a failure to enable PML simply fails vcpu
 +       * creation. This keeps the PML logic simple (we never have to deal
 +       * with cases such as PML being enabled on only some of the guest's
 +       * vcpus).
 +       */
 +      if (enable_pml) {
 +              err = vmx_enable_pml(vmx);
 +              if (err)
 +                      goto free_vmcs;
 +      }
 +
        return &vmx->vcpu;
  
  free_vmcs:
@@@ -8564,10 -8184,9 +8564,10 @@@ static unsigned long nested_ept_get_cr3
  
  static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
  {
 -      kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
 -                      nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
 -
 +      WARN_ON(mmu_is_nested(vcpu));
 +      kvm_init_shadow_ept_mmu(vcpu,
 +                      to_vmx(vcpu)->nested.nested_vmx_ept_caps &
 +                      VMX_EPT_EXECUTE_ONLY_BIT);
        vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
        vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
        vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@@ -8580,18 -8199,6 +8580,18 @@@ static void nested_ept_uninit_mmu_conte
        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
  }
  
 +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 +                                          u16 error_code)
 +{
 +      bool inequality, bit;
 +
 +      bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
 +      inequality =
 +              (error_code & vmcs12->page_fault_error_code_mask) !=
 +               vmcs12->page_fault_error_code_match;
 +      return inequality ^ bit;
 +}
 +
  static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
                struct x86_exception *fault)
  {
  
        WARN_ON(!is_guest_mode(vcpu));
  
 -      /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
 -      if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
 +      if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
                nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
                                  vmcs_read32(VM_EXIT_INTR_INFO),
                                  vmcs_readl(EXIT_QUALIFICATION));
@@@ -8653,31 -8261,6 +8653,31 @@@ static bool nested_get_vmcs12_pages(str
                        return false;
        }
  
 +      if (nested_cpu_has_posted_intr(vmcs12)) {
 +              if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
 +                      return false;
 +
 +              if (vmx->nested.pi_desc_page) { /* shouldn't happen */
 +                      kunmap(vmx->nested.pi_desc_page);
 +                      nested_release_page(vmx->nested.pi_desc_page);
 +              }
 +              vmx->nested.pi_desc_page =
 +                      nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
 +              if (!vmx->nested.pi_desc_page)
 +                      return false;
 +
 +              vmx->nested.pi_desc =
 +                      (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
 +              if (!vmx->nested.pi_desc) {
 +                      nested_release_page_clean(vmx->nested.pi_desc_page);
 +                      return false;
 +              }
 +              vmx->nested.pi_desc =
 +                      (struct pi_desc *)((void *)vmx->nested.pi_desc +
 +                      (unsigned long)(vmcs12->posted_intr_desc_addr &
 +                      (PAGE_SIZE - 1)));
 +      }
 +
        return true;
  }
  
@@@ -8703,310 -8286,6 +8703,310 @@@ static void vmx_start_preemption_timer(
                      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
  }
  
 +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 +                                              struct vmcs12 *vmcs12)
 +{
 +      int maxphyaddr;
 +      u64 addr;
 +
 +      if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
 +              return 0;
 +
 +      if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
 +              WARN_ON(1);
 +              return -EINVAL;
 +      }
 +      maxphyaddr = cpuid_maxphyaddr(vcpu);
 +
 +      if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
 +         ((addr + PAGE_SIZE) >> maxphyaddr))
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +
 +/*
 + * Merge L0's and L1's MSR bitmaps; return false to indicate that we do
 + * not use the hardware MSR bitmap.
 + */
 +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 +                                             struct vmcs12 *vmcs12)
 +{
 +      int msr;
 +      struct page *page;
 +      unsigned long *msr_bitmap;
 +
 +      if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
 +              return false;
 +
 +      page = nested_get_page(vcpu, vmcs12->msr_bitmap);
 +      if (!page) {
 +              WARN_ON(1);
 +              return false;
 +      }
 +      msr_bitmap = (unsigned long *)kmap(page);
 +      if (!msr_bitmap) {
 +              nested_release_page_clean(page);
 +              WARN_ON(1);
 +              return false;
 +      }
 +
 +      if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
 +              if (nested_cpu_has_apic_reg_virt(vmcs12))
 +                      for (msr = 0x800; msr <= 0x8ff; msr++)
 +                              nested_vmx_disable_intercept_for_msr(
 +                                      msr_bitmap,
 +                                      vmx_msr_bitmap_nested,
 +                                      msr, MSR_TYPE_R);
 +              /* TPR is allowed */
 +              nested_vmx_disable_intercept_for_msr(msr_bitmap,
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_TASKPRI >> 4),
 +                              MSR_TYPE_R | MSR_TYPE_W);
 +              if (nested_cpu_has_vid(vmcs12)) {
 +                      /* EOI and self-IPI are allowed */
 +                      nested_vmx_disable_intercept_for_msr(
 +                              msr_bitmap,
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_EOI >> 4),
 +                              MSR_TYPE_W);
 +                      nested_vmx_disable_intercept_for_msr(
 +                              msr_bitmap,
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
 +                              MSR_TYPE_W);
 +              }
 +      } else {
 +              /*
 +               * Enable read intercepts for all the x2apic MSRs. We
 +               * should not rely on vmcs12 to do any optimizations
 +               * here, since it may have been modified by L1.
 +               */
 +              for (msr = 0x800; msr <= 0x8ff; msr++)
 +                      __vmx_enable_intercept_for_msr(
 +                              vmx_msr_bitmap_nested,
 +                              msr,
 +                              MSR_TYPE_R);
 +
 +              __vmx_enable_intercept_for_msr(
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_TASKPRI >> 4),
 +                              MSR_TYPE_W);
 +              __vmx_enable_intercept_for_msr(
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_EOI >> 4),
 +                              MSR_TYPE_W);
 +              __vmx_enable_intercept_for_msr(
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
 +                              MSR_TYPE_W);
 +      }
 +      kunmap(page);
 +      nested_release_page_clean(page);
 +
 +      return true;
 +}
 +
 +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 +                                         struct vmcs12 *vmcs12)
 +{
 +      if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 +          !nested_cpu_has_apic_reg_virt(vmcs12) &&
 +          !nested_cpu_has_vid(vmcs12) &&
 +          !nested_cpu_has_posted_intr(vmcs12))
 +              return 0;
 +
 +      /*
 +       * If virtualize x2apic mode is enabled,
 +       * virtualize apic access must be disabled.
 +       */
 +      if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 +          nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 +              return -EINVAL;
 +
 +      /*
 +       * If virtual interrupt delivery is enabled,
 +       * we must exit on external interrupts.
 +       */
 +      if (nested_cpu_has_vid(vmcs12) &&
 +         !nested_exit_on_intr(vcpu))
 +              return -EINVAL;
 +
 +      /*
 +       * Bits 15:8 must be zero in posted_intr_nv; the descriptor
 +       * address has already been checked in nested_get_vmcs12_pages.
 +       */
 +      if (nested_cpu_has_posted_intr(vmcs12) &&
 +         (!nested_cpu_has_vid(vmcs12) ||
 +          !nested_exit_intr_ack_set(vcpu) ||
 +          vmcs12->posted_intr_nv & 0xff00))
 +              return -EINVAL;
 +
 +      /* TPR shadow is needed by all APICv features. */
 +      if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +
 +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 +                                     unsigned long count_field,
 +                                     unsigned long addr_field,
 +                                     int maxphyaddr)
 +{
 +      u64 count, addr;
 +
 +      if (vmcs12_read_any(vcpu, count_field, &count) ||
 +          vmcs12_read_any(vcpu, addr_field, &addr)) {
 +              WARN_ON(1);
 +              return -EINVAL;
 +      }
 +      if (count == 0)
 +              return 0;
 +      if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
 +          (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
 +              pr_warn_ratelimited(
 +                      "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
 +                      addr_field, maxphyaddr, count, addr);
 +              return -EINVAL;
 +      }
 +      return 0;
 +}
 +
 +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
 +                                              struct vmcs12 *vmcs12)
 +{
 +      int maxphyaddr;
 +
 +      if (vmcs12->vm_exit_msr_load_count == 0 &&
 +          vmcs12->vm_exit_msr_store_count == 0 &&
 +          vmcs12->vm_entry_msr_load_count == 0)
 +              return 0; /* Fast path */
 +      maxphyaddr = cpuid_maxphyaddr(vcpu);
 +      if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
 +                                      VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
 +          nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
 +                                      VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
 +          nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
 +                                      VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
 +              return -EINVAL;
 +      return 0;
 +}
 +
 +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
 +                                     struct vmx_msr_entry *e)
 +{
 +      /* x2APIC MSR accesses are not allowed */
 +      if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
 +              return -EINVAL;
 +      if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
 +          e->index == MSR_IA32_UCODE_REV)
 +              return -EINVAL;
 +      if (e->reserved != 0)
 +              return -EINVAL;
 +      return 0;
 +}
 +
 +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
 +                                   struct vmx_msr_entry *e)
 +{
 +      if (e->index == MSR_FS_BASE ||
 +          e->index == MSR_GS_BASE ||
 +          e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
 +          nested_vmx_msr_check_common(vcpu, e))
 +              return -EINVAL;
 +      return 0;
 +}
 +
 +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
 +                                    struct vmx_msr_entry *e)
 +{
 +      if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
 +          nested_vmx_msr_check_common(vcpu, e))
 +              return -EINVAL;
 +      return 0;
 +}
 +
 +/*
 + * Load the guest's/host's MSRs at nested entry/exit.
 + * Return 0 on success, or the 1-based index of the failing entry on failure.
 + */
 +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 +{
 +      u32 i;
 +      struct vmx_msr_entry e;
 +      struct msr_data msr;
 +
 +      msr.host_initiated = false;
 +      for (i = 0; i < count; i++) {
 +              if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
 +                                 &e, sizeof(e))) {
 +                      pr_warn_ratelimited(
 +                              "%s cannot read MSR entry (%u, 0x%08llx)\n",
 +                              __func__, i, gpa + i * sizeof(e));
 +                      goto fail;
 +              }
 +              if (nested_vmx_load_msr_check(vcpu, &e)) {
 +                      pr_warn_ratelimited(
 +                              "%s check failed (%u, 0x%x, 0x%x)\n",
 +                              __func__, i, e.index, e.reserved);
 +                      goto fail;
 +              }
 +              msr.index = e.index;
 +              msr.data = e.value;
 +              if (kvm_set_msr(vcpu, &msr)) {
 +                      pr_warn_ratelimited(
 +                              "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 +                              __func__, i, e.index, e.value);
 +                      goto fail;
 +              }
 +      }
 +      return 0;
 +fail:
 +      return i + 1;
 +}
 +
 +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 +{
 +      u32 i;
 +      struct vmx_msr_entry e;
 +
 +      for (i = 0; i < count; i++) {
 +              if (kvm_read_guest(vcpu->kvm,
 +                                 gpa + i * sizeof(e),
 +                                 &e, 2 * sizeof(u32))) {
 +                      pr_warn_ratelimited(
 +                              "%s cannot read MSR entry (%u, 0x%08llx)\n",
 +                              __func__, i, gpa + i * sizeof(e));
 +                      return -EINVAL;
 +              }
 +              if (nested_vmx_store_msr_check(vcpu, &e)) {
 +                      pr_warn_ratelimited(
 +                              "%s check failed (%u, 0x%x, 0x%x)\n",
 +                              __func__, i, e.index, e.reserved);
 +                      return -EINVAL;
 +              }
 +              if (kvm_get_msr(vcpu, e.index, &e.value)) {
 +                      pr_warn_ratelimited(
 +                              "%s cannot read MSR (%u, 0x%x)\n",
 +                              __func__, i, e.index);
 +                      return -EINVAL;
 +              }
 +              if (kvm_write_guest(vcpu->kvm,
 +                                  gpa + i * sizeof(e) +
 +                                      offsetof(struct vmx_msr_entry, value),
 +                                  &e.value, sizeof(e.value))) {
 +                      pr_warn_ratelimited(
 +                              "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 +                              __func__, i, e.index, e.value);
 +                      return -EINVAL;
 +              }
 +      }
 +      return 0;
 +}
 +
  /*
   * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
   * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@@ -9086,23 -8365,8 +9086,23 @@@ static void prepare_vmcs02(struct kvm_v
  
        exec_control = vmcs12->pin_based_vm_exec_control;
        exec_control |= vmcs_config.pin_based_exec_ctrl;
 -      exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER |
 -                          PIN_BASED_POSTED_INTR);
 +      exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 +
 +      if (nested_cpu_has_posted_intr(vmcs12)) {
 +              /*
 +               * Note that we use L0's vector here and in
 +               * vmx_deliver_nested_posted_interrupt.
 +               */
 +              vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
 +              vmx->nested.pi_pending = false;
 +              vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
 +              vmcs_write64(POSTED_INTR_DESC_ADDR,
 +                      page_to_phys(vmx->nested.pi_desc_page) +
 +                      (unsigned long)(vmcs12->posted_intr_desc_addr &
 +                      (PAGE_SIZE - 1)));
 +      } else
 +              exec_control &= ~PIN_BASED_POSTED_INTR;
 +
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
  
        vmx->nested.preemption_timer_expired = false;
                        else
                                vmcs_write64(APIC_ACCESS_ADDR,
                                  page_to_phys(vmx->nested.apic_access_page));
 -              } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
 +              } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
 +                          (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
                        exec_control |=
                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
                        kvm_vcpu_reload_apic_access_page(vcpu);
                }
  
 +              if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
 +                      vmcs_write64(EOI_EXIT_BITMAP0,
 +                              vmcs12->eoi_exit_bitmap0);
 +                      vmcs_write64(EOI_EXIT_BITMAP1,
 +                              vmcs12->eoi_exit_bitmap1);
 +                      vmcs_write64(EOI_EXIT_BITMAP2,
 +                              vmcs12->eoi_exit_bitmap2);
 +                      vmcs_write64(EOI_EXIT_BITMAP3,
 +                              vmcs12->eoi_exit_bitmap3);
 +                      vmcs_write16(GUEST_INTR_STATUS,
 +                              vmcs12->guest_intr_status);
 +              }
 +
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
        }
  
                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
        }
  
 +      if (cpu_has_vmx_msr_bitmap() &&
 +          exec_control & CPU_BASED_USE_MSR_BITMAPS &&
 +          nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
 +              vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
 +      } else
 +              exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
 +
        /*
 -       * Merging of IO and MSR bitmaps not currently supported.
 +       * Merging of IO bitmap not currently supported.
         * Rather, exit every time.
         */
 -      exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
        exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
        exec_control |= CPU_BASED_UNCOND_IO_EXITING;
  
@@@ -9338,7 -8582,6 +9338,7 @@@ static int nested_vmx_run(struct kvm_vc
        int cpu;
        struct loaded_vmcs *vmcs02;
        bool ia32e;
 +      u32 msr_entry_idx;
  
        if (!nested_vmx_check_permission(vcpu) ||
            !nested_vmx_check_vmcs12(vcpu))
                return 1;
        }
  
 -      if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
 -                      !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
 +      if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
                /*TODO: Also verify bits beyond physical address width are 0*/
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
  
 -      if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
 -              /*TODO: Also verify bits beyond physical address width are 0*/
 +      if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
  
 -      if (vmcs12->vm_entry_msr_load_count > 0 ||
 -          vmcs12->vm_exit_msr_load_count > 0 ||
 -          vmcs12->vm_exit_msr_store_count > 0) {
 -              pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
 -                                  __func__);
 +      if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
 +              nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 +              return 1;
 +      }
 +
 +      if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
  
        if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
 -                              nested_vmx_true_procbased_ctls_low,
 -                              nested_vmx_procbased_ctls_high) ||
 +                              vmx->nested.nested_vmx_true_procbased_ctls_low,
 +                              vmx->nested.nested_vmx_procbased_ctls_high) ||
            !vmx_control_verify(vmcs12->secondary_vm_exec_control,
 -            nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
 +                              vmx->nested.nested_vmx_secondary_ctls_low,
 +                              vmx->nested.nested_vmx_secondary_ctls_high) ||
            !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
 -            nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
 +                              vmx->nested.nested_vmx_pinbased_ctls_low,
 +                              vmx->nested.nested_vmx_pinbased_ctls_high) ||
            !vmx_control_verify(vmcs12->vm_exit_controls,
 -                              nested_vmx_true_exit_ctls_low,
 -                              nested_vmx_exit_ctls_high) ||
 +                              vmx->nested.nested_vmx_true_exit_ctls_low,
 +                              vmx->nested.nested_vmx_exit_ctls_high) ||
            !vmx_control_verify(vmcs12->vm_entry_controls,
 -                              nested_vmx_true_entry_ctls_low,
 -                              nested_vmx_entry_ctls_high))
 +                              vmx->nested.nested_vmx_true_entry_ctls_low,
 +                              vmx->nested.nested_vmx_entry_ctls_high))
        {
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
                return 1;
        }
  
 -      if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||
 +      if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
            ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
                nested_vmx_entry_failure(vcpu, vmcs12,
                        EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
  
        vmx_segment_cache_clear(vmx);
  
 -      vmcs12->launch_state = 1;
 -
        prepare_vmcs02(vcpu, vmcs12);
  
 +      msr_entry_idx = nested_vmx_load_msr(vcpu,
 +                                          vmcs12->vm_entry_msr_load_addr,
 +                                          vmcs12->vm_entry_msr_load_count);
 +      if (msr_entry_idx) {
 +              leave_guest_mode(vcpu);
 +              vmx_load_vmcs01(vcpu);
 +              nested_vmx_entry_failure(vcpu, vmcs12,
 +                              EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
 +              return 1;
 +      }
 +
 +      vmcs12->launch_state = 1;
 +
        if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
                return kvm_emulate_halt(vcpu);
  
@@@ -9638,10 -8869,9 +9638,10 @@@ static int vmx_check_nested_events(stru
                if (vmx->nested.nested_run_pending)
                        return -EBUSY;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
 +              return 0;
        }
  
 -      return 0;
 +      return vmx_complete_nested_posted_interrupt(vcpu);
  }
  
  static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@@ -9751,9 -8981,6 +9751,9 @@@ static void prepare_vmcs12(struct kvm_v
                vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
        }
  
 +      if (nested_cpu_has_vid(vmcs12))
 +              vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
 +
        vmcs12->vm_entry_controls =
                (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
                (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
@@@ -9945,13 -9172,6 +9945,13 @@@ static void load_vmcs12_host_state(stru
  
        kvm_set_dr(vcpu, 7, 0x400);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 +
 +      if (cpu_has_vmx_msr_bitmap())
 +              vmx_set_msr_bitmap(vcpu);
 +
 +      if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
 +                              vmcs12->vm_exit_msr_load_count))
 +              nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
  }
  
  /*
@@@ -9973,10 -9193,6 +9973,10 @@@ static void nested_vmx_vmexit(struct kv
        prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
                       exit_qualification);
  
 +      if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
 +                               vmcs12->vm_exit_msr_store_count))
 +              nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
 +
        vmx_load_vmcs01(vcpu);
  
        if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
                nested_release_page(vmx->nested.virtual_apic_page);
                vmx->nested.virtual_apic_page = NULL;
        }
 +      if (vmx->nested.pi_desc_page) {
 +              kunmap(vmx->nested.pi_desc_page);
 +              nested_release_page(vmx->nested.pi_desc_page);
 +              vmx->nested.pi_desc_page = NULL;
 +              vmx->nested.pi_desc = NULL;
 +      }
  
        /*
         * We are now running in L2, mmu_notifier will force to reload the
@@@ -10091,31 -9301,6 +10091,31 @@@ static void vmx_sched_in(struct kvm_vcp
                shrink_ple_window(vcpu);
  }
  
 +static void vmx_slot_enable_log_dirty(struct kvm *kvm,
 +                                   struct kvm_memory_slot *slot)
 +{
 +      kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
 +      kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
 +}
 +
 +static void vmx_slot_disable_log_dirty(struct kvm *kvm,
 +                                     struct kvm_memory_slot *slot)
 +{
 +      kvm_mmu_slot_set_dirty(kvm, slot);
 +}
 +
 +static void vmx_flush_log_dirty(struct kvm *kvm)
 +{
 +      kvm_flush_pml_buffers(kvm);
 +}
 +
 +static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
 +                                         struct kvm_memory_slot *memslot,
 +                                         gfn_t offset, unsigned long mask)
 +{
 +      kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 +}
 +
  static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .check_nested_events = vmx_check_nested_events,
  
        .sched_in = vmx_sched_in,
 +
 +      .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
 +      .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
 +      .flush_log_dirty = vmx_flush_log_dirty,
 +      .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
  };
  
  static int __init vmx_init(void)
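
The dirty-logging hooks added to kvm_x86_ops above are exercised from KVM's dirty-log ioctl path, which (per the comment above kvm_flush_pml_buffers) flushes every vcpu's PML buffer before dirty_bitmap is reported to userspace. As a rough usage sketch from the VMM side (hypothetical helper, not part of this patch; assumes the memslot was created with KVM_MEM_LOG_DIRTY_PAGES and that the caller knows its page count):

    #include <linux/kvm.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>

    /* Hypothetical VMM helper: fetch the dirty bitmap for one memslot.
     * With PML enabled the kernel drains all vcpu PML buffers before
     * copying the bitmap out, so no extra synchronization is needed here. */
    static void *get_dirty_bitmap(int vm_fd, unsigned int slot,
                                  unsigned long memslot_npages)
    {
            struct kvm_dirty_log log;
            /* On 64-bit hosts KVM copies the bitmap rounded up to a
             * multiple of 64 pages. */
            size_t bitmap_bytes = ((memslot_npages + 63) / 64) * 8;
            void *bitmap = calloc(1, bitmap_bytes);

            if (!bitmap)
                    return NULL;

            memset(&log, 0, sizeof(log));
            log.slot = slot;
            log.dirty_bitmap = bitmap;

            if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
                    free(bitmap);
                    return NULL;
            }
            return bitmap;  /* one bit per guest page, set if the page was dirtied */
    }
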
diff --combined arch/x86/mm/init.c
index 649da47d382706b1001a403daac284b833fc7202,a74aa0fd185332a110eb2d7b04607c0fabd4dc30..553c094b9cd7984b7334a95122931a93249f1ddf
@@@ -173,11 -173,11 +173,11 @@@ static void __init probe_page_size_mask
  
        /* Enable PSE if available */
        if (cpu_has_pse)
-               set_in_cr4(X86_CR4_PSE);
+               cr4_set_bits_and_update_boot(X86_CR4_PSE);
  
        /* Enable PGE if available */
        if (cpu_has_pge) {
-               set_in_cr4(X86_CR4_PGE);
+               cr4_set_bits_and_update_boot(X86_CR4_PGE);
                __supported_pte_mask |= _PAGE_GLOBAL;
        }
  }
@@@ -608,7 -608,7 +608,7 @@@ void __init init_mem_mapping(void
   *
   *
   * On x86, access has to be given to the first megabyte of ram because that area
 - * contains bios code and data regions used by X and dosemu and similar apps.
 + * contains BIOS code and data regions used by X and dosemu and similar apps.
   * Access has to be given to non-kernel-ram areas as well, these contain the PCI
   * mmio resources as well as potential bios/acpi data regions.
   */
@@@ -713,6 -713,15 +713,15 @@@ void __init zone_sizes_init(void
        free_area_init_nodes(max_zone_pfns);
  }
  
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+ #ifdef CONFIG_SMP
+       .active_mm = &init_mm,
+       .state = 0,
+ #endif
+       .cr4 = ~0UL,    /* fail hard if we screw up cr4 shadow initialization */
+ };
+ EXPORT_SYMBOL_GPL(cpu_tlbstate);
  void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
  {
        /* entry 0 MUST be WB (hardwired to speed up translations) */
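
For reference, the cr4_read_shadow()/cr4_set_bits() helpers used throughout this diff are built on the cpu_tlbstate.cr4 field initialized above (the ~0UL initializer deliberately poisons the shadow, as the comment says, until boot code seeds it with the real register value). A simplified sketch of what such helpers look like, assuming the usual this_cpu_read/this_cpu_write percpu accessors — not the verbatim kernel code, which lives in arch/x86/include/asm/tlbflush.h:

    /* Simplified sketch of the CR4 shadow helpers. */
    static inline unsigned long cr4_read_shadow(void)
    {
            /* Cheap: reads the per-cpu shadow instead of the real CR4. */
            return this_cpu_read(cpu_tlbstate.cr4);
    }

    static inline void cr4_set_bits(unsigned long mask)
    {
            unsigned long cr4 = this_cpu_read(cpu_tlbstate.cr4);

            if ((cr4 | mask) != cr4) {      /* skip the write if nothing changes */
                    cr4 |= mask;
                    this_cpu_write(cpu_tlbstate.cr4, cr4);
                    write_cr4(cr4);         /* keep the hardware register in sync */
            }
    }
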
index 2cdc9d422bed9245bef3e0e62a007efad80be6c1,33262004c31041c69b0706eb0b07bf37931b83b8..2b621982938d4653436b3d5cfe7578c1c71e6bb9
@@@ -202,6 -202,13 +202,13 @@@ struct pmu 
         */
        int (*event_init)               (struct perf_event *event);
  
+       /*
+        * Notification that the event was mapped or unmapped.  Called
+        * in the context of the mapping task.
+        */
+       void (*event_mapped)            (struct perf_event *event); /*optional*/
+       void (*event_unmapped)          (struct perf_event *event); /*optional*/
  #define PERF_EF_START 0x01            /* start the counter when adding    */
  #define PERF_EF_RELOAD        0x02            /* reload the counter when starting */
  #define PERF_EF_UPDATE        0x04            /* update the counter when stopping */
@@@ -907,22 -914,12 +914,22 @@@ struct perf_pmu_events_attr 
        const char *event_str;
  };
  
 +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 +                            char *page);
 +
  #define PMU_EVENT_ATTR(_name, _var, _id, _show)                               \
  static struct perf_pmu_events_attr _var = {                           \
        .attr = __ATTR(_name, 0444, _show, NULL),                       \
        .id   =  _id,                                                   \
  };
  
 +#define PMU_EVENT_ATTR_STRING(_name, _var, _str)                          \
 +static struct perf_pmu_events_attr _var = {                               \
 +      .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
 +      .id             = 0,                                                \
 +      .event_str      = _str,                                             \
 +};
 +
  #define PMU_FORMAT_ATTR(_name, _format)                                       \
  static ssize_t                                                                \
  _name##_show(struct device *dev,                                      \
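
The event_mapped/event_unmapped callbacks added to struct pmu above are optional and run in the mapping task's context; on x86 they are what allows the PMU to restrict RDPMC to tasks that currently have a perf event mmap-ed. A hypothetical PMU could hook them up along these lines (illustrative skeleton only; demo_pmu, active_mmaps and the enable/disable steps are assumptions, not the actual x86 implementation):

    #include <linux/kernel.h>
    #include <linux/atomic.h>
    #include <linux/perf_event.h>

    /* Hypothetical PMU wrapper that tracks how many of its events are
     * currently mmap-ed by user space. */
    struct demo_pmu {
            struct pmu      pmu;
            atomic_t        active_mmaps;
    };

    static void demo_event_mapped(struct perf_event *event)
    {
            struct demo_pmu *d = container_of(event->pmu, struct demo_pmu, pmu);

            /* First mapping: this is where user-space counter access
             * (e.g. RDPMC on x86) would be switched on. */
            if (atomic_inc_return(&d->active_mmaps) == 1)
                    pr_debug("demo_pmu: enabling user-space counter access\n");
    }

    static void demo_event_unmapped(struct perf_event *event)
    {
            struct demo_pmu *d = container_of(event->pmu, struct demo_pmu, pmu);

            /* Last mapping gone: revoke user-space access again. */
            if (atomic_dec_and_test(&d->active_mmaps))
                    pr_debug("demo_pmu: disabling user-space counter access\n");
    }

The callbacks are wired up simply by filling in the corresponding struct pmu fields before perf_pmu_register(); both remain NULL for PMUs that do not care about mappings.
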
diff --combined kernel/events/core.c
index 8812d8e35f5b03b13e148ff67ae33453cad306c1,13209a90b751d11ba9e7d05334997916f7e5c609..f04daabfd1cffb78856e03b634b9d7c914faf4d1
@@@ -4101,7 -4101,8 +4101,8 @@@ unlock
        rcu_read_unlock();
  }
  
- void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+ void __weak arch_perf_update_userpage(
+       struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
  {
  }
  
@@@ -4151,7 -4152,7 +4152,7 @@@ void perf_event_update_userpage(struct 
        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);
  
-       arch_perf_update_userpage(userpg, now);
+       arch_perf_update_userpage(event, userpg, now);
  
        barrier();
        ++userpg->lock;
@@@ -4293,6 -4294,9 +4294,9 @@@ static void perf_mmap_open(struct vm_ar
  
        atomic_inc(&event->mmap_count);
        atomic_inc(&event->rb->mmap_count);
+       if (event->pmu->event_mapped)
+               event->pmu->event_mapped(event);
  }
  
  /*
@@@ -4312,6 -4316,9 +4316,9 @@@ static void perf_mmap_close(struct vm_a
        int mmap_locked = rb->mmap_locked;
        unsigned long size = perf_data_size(rb);
  
+       if (event->pmu->event_unmapped)
+               event->pmu->event_unmapped(event);
        atomic_dec(&rb->mmap_count);
  
        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@@ -4513,6 -4520,9 +4520,9 @@@ unlock
        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &perf_mmap_vmops;
  
+       if (event->pmu->event_mapped)
+               event->pmu->event_mapped(event);
        return ret;
  }
  
@@@ -8508,18 -8518,6 +8518,18 @@@ void __init perf_event_init(void
                     != 1024);
  }
  
 +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 +                            char *page)
 +{
 +      struct perf_pmu_events_attr *pmu_attr =
 +              container_of(attr, struct perf_pmu_events_attr, attr);
 +
 +      if (pmu_attr->event_str)
 +              return sprintf(page, "%s\n", pmu_attr->event_str);
 +
 +      return 0;
 +}
 +
  static int __init perf_event_sysfs_init(void)
  {
        struct pmu *pmu;
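
To see why the mapped/unmapped notifications matter, here is roughly the user-space side of the new default policy: a task gains RDPMC access by mmap-ing one of its own perf events, and reads the counter by combining the kernel-maintained offset from the mmap-ed page with an RDPMC of the hardware counter. A minimal x86-64 sketch (error handling omitted; the seqlock-style loop follows the self-monitoring pattern documented in include/uapi/linux/perf_event.h):

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static inline uint64_t rdpmc(uint32_t counter)
    {
            uint32_t lo, hi;

            asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
            return ((uint64_t)hi << 32) | lo;
    }

    /* Read a self-monitored counter through the mmap-ed user page; the
     * mmap itself is what (under the new default) grants this task RDPMC. */
    static uint64_t read_counter(volatile struct perf_event_mmap_page *pc)
    {
            uint64_t count;
            uint32_t seq, idx;

            do {
                    seq = pc->lock;
                    asm volatile("" ::: "memory");  /* barrier */
                    idx = pc->index;
                    count = pc->offset;
                    if (pc->cap_user_rdpmc && idx)
                            count += rdpmc(idx - 1);
                    asm volatile("" ::: "memory");  /* barrier */
            } while (pc->lock != seq);

            return count;
    }

    int main(void)
    {
            struct perf_event_attr attr;
            struct perf_event_mmap_page *pc;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_INSTRUCTIONS;
            attr.exclude_kernel = 1;

            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);

            /* ... run the code being measured ... */

            return read_counter(pc) != 0;
    }
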