Git Repo - linux.git/commitdiff
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <[email protected]>
Mon, 16 Feb 2015 22:58:12 +0000 (14:58 -0800)
committer Linus Torvalds <[email protected]>
Mon, 16 Feb 2015 22:58:12 +0000 (14:58 -0800)
Pull x86 perf updates from Ingo Molnar:
 "This series tightens up RDPMC permissions: currently even highly
  sandboxed x86 execution environments (such as seccomp) have permission
  to execute RDPMC, which may leak various perf events / PMU state such
  as timing information and other CPU execution details.

  This 'all is allowed' RDPMC mode is still preserved as the
  (non-default) /sys/devices/cpu/rdpmc=2 setting.  The new default is
  that RDPMC access is only allowed if a perf event is mmap-ed (which is
  needed to correctly interpret RDPMC counter values in any case).

  As a side effect of these changes CR4 handling is cleaned up in the
  x86 code and a shadow copy of the CR4 value is added.

  The extra CR4 manipulation adds less than ~50 ns to the context
  switch cost between rdpmc-capable and rdpmc-non-capable mms"
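
As background for the mmap-gated default described above, the sketch below shows
roughly how a self-monitoring task would satisfy the new requirement: open a
hardware event with perf_event_open(2), mmap its control page, and only then read
the counter with RDPMC using the seqlock protocol exported in struct
perf_event_mmap_page (a zero pc->index or a clear cap_user_rdpmc means RDPMC is
not usable).  This is a minimal illustration of the documented userspace
protocol, not code from the merged series; the choice of
PERF_COUNT_HW_INSTRUCTIONS is arbitrary and most error handling is omitted.

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <sys/mman.h>
    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>
    #include <unistd.h>

    #define barrier() asm volatile("" ::: "memory")

    static uint64_t rdpmc(uint32_t counter)
    {
        uint32_t lo, hi;

        asm volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(counter));
        return lo | ((uint64_t)hi << 32);
    }

    int main(void)
    {
        struct perf_event_attr attr;
        struct perf_event_mmap_page *pc;
        int64_t count;
        uint32_t seq, idx;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_HARDWARE;
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
            perror("perf_event_open");
            return 1;
        }

        /* The mmap is what grants this mm RDPMC access by default now. */
        pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
        if (pc == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        do {
            seq = pc->lock;                   /* lock is a sequence count */
            barrier();
            idx = pc->index;
            count = pc->offset;
            if (pc->cap_user_rdpmc && idx) {
                uint64_t pmc = rdpmc(idx - 1);

                /* sign-extend the raw counter from pmc_width bits */
                pmc <<= 64 - pc->pmc_width;
                count += (int64_t)pmc >> (64 - pc->pmc_width);
            }
            barrier();
        } while (pc->lock != seq);

        printf("instructions: %lld\n", (long long)count);
        return 0;
    }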

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86: Add /sys/devices/cpu/rdpmc=2 to allow rdpmc for all tasks
  perf/x86: Only allow rdpmc if a perf_event is mapped
  perf: Pass the event to arch_perf_update_userpage()
  perf: Add pmu callbacks to track event mapping and unmapping
  x86: Add a comment clarifying LDT context switching
  x86: Store a per-cpu shadow copy of CR4
  x86: Clean up cr4 manipulation
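
The last two entries in that list are where the per-CPU CR4 shadow comes from.
Below is a toy, self-contained model of the pattern, not the kernel's actual
code: hw_cr4, write_hw_cr4() and hw_writes are stand-ins invented for
illustration.  Reads are served from the cached copy, and the expensive
register write is skipped when the requested bits are already in the desired
state, which is what keeps the added context-switch cost small.  In the kernel
the shadow lives in per-CPU data and is populated early via cr4_init_shadow(),
as the cpu_init() and x86_64_start_kernel() hunks in the diff below show.

    #include <stdio.h>

    /* Toy model of the CR4 shadow pattern -- not the kernel's code. */
    static unsigned long hw_cr4;      /* stands in for the real register */
    static unsigned long cr4_shadow;  /* per-CPU cached copy in the kernel */
    static unsigned long hw_writes;   /* count the expensive writes */

    static void write_hw_cr4(unsigned long val)
    {
        hw_cr4 = val;
        hw_writes++;
    }

    static void cr4_init_shadow(void)
    {
        cr4_shadow = hw_cr4;          /* populate before anyone reads it */
    }

    static unsigned long cr4_read_shadow(void)
    {
        return cr4_shadow;            /* never touches the register */
    }

    static void cr4_set_bits(unsigned long mask)
    {
        unsigned long cr4 = cr4_shadow;

        if ((cr4 | mask) != cr4) {    /* skip no-op updates */
            cr4 |= mask;
            cr4_shadow = cr4;
            write_hw_cr4(cr4);
        }
    }

    static void cr4_clear_bits(unsigned long mask)
    {
        unsigned long cr4 = cr4_shadow;

        if ((cr4 & ~mask) != cr4) {
            cr4 &= ~mask;
            cr4_shadow = cr4;
            write_hw_cr4(cr4);
        }
    }

    int main(void)
    {
        cr4_init_shadow();
        cr4_set_bits(1UL << 8);       /* e.g. a PCE-like bit */
        cr4_set_bits(1UL << 8);       /* already set: no register write */
        cr4_clear_bits(1UL << 8);
        printf("shadow=%#lx hw_writes=%lu\n", cr4_read_shadow(), hw_writes);
        return 0;
    }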

arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/head64.c
arch/x86/kernel/i387.c
arch/x86/kernel/setup.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/mm/init.c
include/linux/perf_event.h
kernel/events/core.c

diff --combined arch/x86/kernel/cpu/common.c
index b15bffcaba6d41fdcb1fff7d813aa3fd09f839ac,07f2fc3c13a4d614b76fdad3b99ee6ec8e9c3683..b5c8ff5e9dfcad79075a1af5f41b0ac0ee37231b
@@@ -19,6 -19,7 +19,7 @@@
  #include <asm/archrandom.h>
  #include <asm/hypervisor.h>
  #include <asm/processor.h>
+ #include <asm/tlbflush.h>
  #include <asm/debugreg.h>
  #include <asm/sections.h>
  #include <asm/vsyscall.h>
@@@ -278,7 -279,7 +279,7 @@@ __setup("nosmep", setup_disable_smep)
  static __always_inline void setup_smep(struct cpuinfo_x86 *c)
  {
        if (cpu_has(c, X86_FEATURE_SMEP))
-               set_in_cr4(X86_CR4_SMEP);
+               cr4_set_bits(X86_CR4_SMEP);
  }
  
  static __init int setup_disable_smap(char *arg)
@@@ -298,9 -299,9 +299,9 @@@ static __always_inline void setup_smap(
  
        if (cpu_has(c, X86_FEATURE_SMAP)) {
  #ifdef CONFIG_X86_SMAP
-               set_in_cr4(X86_CR4_SMAP);
+               cr4_set_bits(X86_CR4_SMAP);
  #else
-               clear_in_cr4(X86_CR4_SMAP);
+               cr4_clear_bits(X86_CR4_SMAP);
  #endif
        }
  }
@@@ -491,18 -492,17 +492,18 @@@ u16 __read_mostly tlb_lld_2m[NR_INFO]
  u16 __read_mostly tlb_lld_4m[NR_INFO];
  u16 __read_mostly tlb_lld_1g[NR_INFO];
  
 -void cpu_detect_tlb(struct cpuinfo_x86 *c)
 +static void cpu_detect_tlb(struct cpuinfo_x86 *c)
  {
        if (this_cpu->c_detect_tlb)
                this_cpu->c_detect_tlb(c);
  
 -      printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
 -              "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
 +      pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
                tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
 -              tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
 -              tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
 -              tlb_lld_1g[ENTRIES]);
 +              tlb_lli_4m[ENTRIES]);
 +
 +      pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
 +              tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
 +              tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
  }
  
  void detect_ht(struct cpuinfo_x86 *c)
@@@ -1294,6 -1294,12 +1295,12 @@@ void cpu_init(void
  
        wait_for_master_cpu(cpu);
  
+       /*
+        * Initialize the CR4 shadow before doing anything that could
+        * try to read it.
+        */
+       cr4_init_shadow();
        /*
         * Load microcode on this cpu if a valid microcode is available.
         * This is early microcode loading procedure.
  
        pr_debug("Initializing CPU#%d\n", cpu);
  
-       clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+       cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
  
        /*
         * Initialize the per-CPU GDT with the boot GDT,
        barrier();
  
        x86_configure_nx();
 -      enable_x2apic();
 +      x2apic_setup();
  
        /*
         * set up and load the per-CPU TSS
@@@ -1394,7 -1400,7 +1401,7 @@@ void cpu_init(void
        printk(KERN_INFO "Initializing CPU#%d\n", cpu);
  
        if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
-               clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+               cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
  
        load_current_idt();
        switch_to_new_gdt(cpu);
diff --combined arch/x86/kernel/cpu/mcheck/mce.c
index cdfed7953963fa31251c24c254f55a267951bf02,15ad3ed1a3cd3f4e7f626c2f77c079f6d0de3249..3be9fa69f8758d61872888f0fcd9bc26692ae0ae
@@@ -44,6 -44,7 +44,7 @@@
  
  #include <asm/processor.h>
  #include <asm/traps.h>
+ #include <asm/tlbflush.h>
  #include <asm/mce.h>
  #include <asm/msr.h>
  
@@@ -116,7 -117,7 +117,7 @@@ static void (*quirk_no_way_out)(int ban
   * CPU/chipset specific EDAC code can register a notifier call here to print
   * MCE errors in a human-readable form.
   */
 -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 +static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
  
  /* Do initial initialization of a struct mce */
  void mce_setup(struct mce *m)
@@@ -312,7 -313,7 +313,7 @@@ static void wait_for_panic(void
        panic("Panicing machine check CPU died");
  }
  
 -static void mce_panic(char *msg, struct mce *final, char *exp)
 +static void mce_panic(const char *msg, struct mce *final, char *exp)
  {
        int i, apei_err = 0;
  
@@@ -530,7 -531,7 +531,7 @@@ static void mce_schedule_work(void
                schedule_work(this_cpu_ptr(&mce_work));
  }
  
 -DEFINE_PER_CPU(struct irq_work, mce_irq_work);
 +static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
  
  static void mce_irq_work_cb(struct irq_work *entry)
  {
@@@ -736,7 -737,7 +737,7 @@@ static atomic_t mce_callin
  /*
   * Check if a timeout waiting for other CPUs happened.
   */
 -static int mce_timed_out(u64 *t)
 +static int mce_timed_out(u64 *t, const char *msg)
  {
        /*
         * The others already did panic for some reason.
                goto out;
        if ((s64)*t < SPINUNIT) {
                if (mca_cfg.tolerant <= 1)
 -                      mce_panic("Timeout synchronizing machine check over CPUs",
 -                                NULL, NULL);
 +                      mce_panic(msg, NULL, NULL);
                cpu_missing = 1;
                return 1;
        }
@@@ -867,8 -869,7 +868,8 @@@ static int mce_start(int *no_way_out
         * Wait for everyone.
         */
        while (atomic_read(&mce_callin) != cpus) {
 -              if (mce_timed_out(&timeout)) {
 +              if (mce_timed_out(&timeout,
 +                                "Timeout: Not all CPUs entered broadcast exception handler")) {
                        atomic_set(&global_nwo, 0);
                        return -1;
                }
                 * only seen by one CPU before cleared, avoiding duplicates.
                 */
                while (atomic_read(&mce_executing) < order) {
 -                      if (mce_timed_out(&timeout)) {
 +                      if (mce_timed_out(&timeout,
 +                                        "Timeout: Subject CPUs unable to finish machine check processing")) {
                                atomic_set(&global_nwo, 0);
                                return -1;
                        }
@@@ -938,8 -938,7 +939,8 @@@ static int mce_end(int order
                 * loops.
                 */
                while (atomic_read(&mce_executing) <= cpus) {
 -                      if (mce_timed_out(&timeout))
 +                      if (mce_timed_out(&timeout,
 +                                        "Timeout: Monarch CPU unable to finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }
                 * Subject: Wait for Monarch to finish.
                 */
                while (atomic_read(&mce_executing) != 0) {
 -                      if (mce_timed_out(&timeout))
 +                      if (mce_timed_out(&timeout,
 +                                        "Timeout: Monarch CPU did not finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }
@@@ -1452,7 -1450,7 +1453,7 @@@ static void __mcheck_cpu_init_generic(v
        bitmap_fill(all_banks, MAX_NR_BANKS);
        machine_check_poll(MCP_UC | m_fl, &all_banks);
  
-       set_in_cr4(X86_CR4_MCE);
+       cr4_set_bits(X86_CR4_MCE);
  
        rdmsrl(MSR_IA32_MCG_CAP, cap);
        if (cap & MCG_CTL_P)
diff --combined arch/x86/kernel/head64.c
index efcddfaf05f9f94f41b20d1a5a99c851e9b09a3b,3b241f0ca005fcfc9a157d2bdff04ad69f0d34f8..c4f8d4659070db99ce190543186bc4a4ac5d2ac9
@@@ -27,7 -27,6 +27,7 @@@
  #include <asm/bios_ebda.h>
  #include <asm/bootparam_utils.h>
  #include <asm/microcode.h>
 +#include <asm/kasan.h>
  
  /*
   * Manage page tables very early on.
@@@ -47,7 -46,7 +47,7 @@@ static void __init reset_early_page_tab
  
        next_early_pgt = 0;
  
 -      write_cr3(__pa(early_level4_pgt));
 +      write_cr3(__pa_nodebug(early_level4_pgt));
  }
  
  /* Create a new PMD entry */
@@@ -60,7 -59,7 +60,7 @@@ int __init early_make_pgtable(unsigned 
        pmdval_t pmd, *pmd_p;
  
        /* Invalid address or early pgt is done ?  */
 -      if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
 +      if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
                return -1;
  
  again:
@@@ -156,11 -155,11 +156,13 @@@ asmlinkage __visible void __init x86_64
                                (__START_KERNEL & PGDIR_MASK)));
        BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
  
+       cr4_init_shadow();
        /* Kill off the identity-map trampoline */
        reset_early_page_tables();
  
 +      kasan_map_early_shadow(early_level4_pgt);
 +
        /* clear bss before set_intr_gate with early_idt_handler */
        clear_bss();
  
        /* set init_level4_pgt kernel high mapping*/
        init_level4_pgt[511] = early_level4_pgt[511];
  
 +      kasan_map_early_shadow(init_level4_pgt);
 +
        x86_64_start_reservations(real_mode_data);
  }
  
diff --combined arch/x86/kernel/i387.c
index 81049ffab2d601cf67ce6bdf455edb4d65abbc46,87727b03196da51aa6aaf68de607485d262e5fab..d5651fce0b71af6c15226483b6a06398ccbeb8a0
  #include <asm/sigcontext.h>
  #include <asm/processor.h>
  #include <asm/math_emu.h>
+ #include <asm/tlbflush.h>
  #include <asm/uaccess.h>
  #include <asm/ptrace.h>
  #include <asm/i387.h>
  #include <asm/fpu-internal.h>
  #include <asm/user.h>
  
 +static DEFINE_PER_CPU(bool, in_kernel_fpu);
 +
 +void kernel_fpu_disable(void)
 +{
 +      WARN_ON(this_cpu_read(in_kernel_fpu));
 +      this_cpu_write(in_kernel_fpu, true);
 +}
 +
 +void kernel_fpu_enable(void)
 +{
 +      this_cpu_write(in_kernel_fpu, false);
 +}
 +
  /*
   * Were we in an interrupt that interrupted kernel mode?
   *
@@@ -46,9 -34,6 +47,9 @@@
   */
  static inline bool interrupted_kernel_fpu_idle(void)
  {
 +      if (this_cpu_read(in_kernel_fpu))
 +              return false;
 +
        if (use_eager_fpu())
                return __thread_has_fpu(current);
  
@@@ -89,10 -74,10 +90,10 @@@ void __kernel_fpu_begin(void
  {
        struct task_struct *me = current;
  
 +      this_cpu_write(in_kernel_fpu, true);
 +
        if (__thread_has_fpu(me)) {
 -              __thread_clear_has_fpu(me);
                __save_init_fpu(me);
 -              /* We do 'stts()' in __kernel_fpu_end() */
        } else if (!use_eager_fpu()) {
                this_cpu_write(fpu_owner_task, NULL);
                clts();
@@@ -102,16 -87,19 +103,16 @@@ EXPORT_SYMBOL(__kernel_fpu_begin)
  
  void __kernel_fpu_end(void)
  {
 -      if (use_eager_fpu()) {
 -              /*
 -               * For eager fpu, most the time, tsk_used_math() is true.
 -               * Restore the user math as we are done with the kernel usage.
 -               * At few instances during thread exit, signal handling etc,
 -               * tsk_used_math() is false. Those few places will take proper
 -               * actions, so we don't need to restore the math here.
 -               */
 -              if (likely(tsk_used_math(current)))
 -                      math_state_restore();
 -      } else {
 +      struct task_struct *me = current;
 +
 +      if (__thread_has_fpu(me)) {
 +              if (WARN_ON(restore_fpu_checking(me)))
 +                      drop_init_fpu(me);
 +      } else if (!use_eager_fpu()) {
                stts();
        }
 +
 +      this_cpu_write(in_kernel_fpu, false);
  }
  EXPORT_SYMBOL(__kernel_fpu_end);
  
@@@ -193,7 -181,7 +194,7 @@@ void fpu_init(void
        if (cpu_has_xmm)
                cr4_mask |= X86_CR4_OSXMMEXCPT;
        if (cr4_mask)
-               set_in_cr4(cr4_mask);
+               cr4_set_bits(cr4_mask);
  
        cr0 = read_cr0();
        cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
diff --combined arch/x86/kernel/setup.c
index 27d2009298646990d1b4956307ab192b030aa211,04e6c62f1a9386535f9b1e4f5156e37ec14f402d..0a2421cca01fad095bbb7caa8e7c779d910d751b
@@@ -89,7 -89,6 +89,7 @@@
  #include <asm/cacheflush.h>
  #include <asm/processor.h>
  #include <asm/bugs.h>
 +#include <asm/kasan.h>
  
  #include <asm/vsyscall.h>
  #include <asm/cpu.h>
@@@ -432,13 -431,15 +432,13 @@@ static void __init parse_setup_data(voi
  
        pa_data = boot_params.hdr.setup_data;
        while (pa_data) {
 -              u32 data_len, map_len, data_type;
 +              u32 data_len, data_type;
  
 -              map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
 -                            (u64)sizeof(struct setup_data));
 -              data = early_memremap(pa_data, map_len);
 +              data = early_memremap(pa_data, sizeof(*data));
                data_len = data->len + sizeof(struct setup_data);
                data_type = data->type;
                pa_next = data->next;
 -              early_iounmap(data, map_len);
 +              early_iounmap(data, sizeof(*data));
  
                switch (data_type) {
                case SETUP_E820_EXT:
@@@ -1175,11 -1176,9 +1175,11 @@@ void __init setup_arch(char **cmdline_p
  
        x86_init.paging.pagetable_init();
  
 +      kasan_init();
 +
        if (boot_cpu_data.cpuid_level >= 0) {
                /* A CPU has %cr4 if and only if it has CPUID */
-               mmu_cr4_features = read_cr4();
+               mmu_cr4_features = __read_cr4();
                if (trampoline_cr4_features)
                        *trampoline_cr4_features = mmu_cr4_features;
        }
diff --combined arch/x86/kvm/svm.c
index a17d848c6d42d0d9ba292b06ec9df7dd857fe790,496a54839968e4c8389b67c676f2a3577a3bc785..d319e0c24758876178aeab46c65fe611cb02126e
@@@ -1583,7 -1583,7 +1583,7 @@@ static void svm_set_cr0(struct kvm_vcp
  
  static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  {
-       unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+       unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
  
        if (cr4 & X86_CR4_VMXE)
@@@ -2003,8 -2003,8 +2003,8 @@@ static void nested_svm_inject_npf_exit(
  
  static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
  {
 -      kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
 -
 +      WARN_ON(mmu_is_nested(vcpu));
 +      kvm_init_shadow_mmu(vcpu);
        vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
        vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
        vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
diff --combined arch/x86/kvm/vmx.c
index 3f73bfad0349e74042a9315ec20f978a9b5bec85,8dca6ccbb9cefcdb960d8e4ceb62366ef4729ebc..14c1a18d206aeee0d59637162b0f1a58056c8941
@@@ -45,7 -45,6 +45,7 @@@
  #include <asm/perf_event.h>
  #include <asm/debugreg.h>
  #include <asm/kexec.h>
 +#include <asm/apic.h>
  
  #include "trace.h"
  
@@@ -102,9 -101,6 +102,9 @@@ module_param(nested, bool, S_IRUGO)
  
  static u64 __read_mostly host_xss;
  
 +static bool __read_mostly enable_pml = 1;
 +module_param_named(pml, enable_pml, bool, S_IRUGO);
 +
  #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
  #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
  #define KVM_VM_CR0_ALWAYS_ON                                          \
@@@ -219,12 -215,7 +219,12 @@@ struct __packed vmcs12 
        u64 tsc_offset;
        u64 virtual_apic_page_addr;
        u64 apic_access_addr;
 +      u64 posted_intr_desc_addr;
        u64 ept_pointer;
 +      u64 eoi_exit_bitmap0;
 +      u64 eoi_exit_bitmap1;
 +      u64 eoi_exit_bitmap2;
 +      u64 eoi_exit_bitmap3;
        u64 xss_exit_bitmap;
        u64 guest_physical_address;
        u64 vmcs_link_pointer;
        u32 vmx_preemption_timer_value;
        u32 padding32[7]; /* room for future expansion */
        u16 virtual_processor_id;
 +      u16 posted_intr_nv;
        u16 guest_es_selector;
        u16 guest_cs_selector;
        u16 guest_ss_selector;
        u16 guest_gs_selector;
        u16 guest_ldtr_selector;
        u16 guest_tr_selector;
 +      u16 guest_intr_status;
        u16 host_es_selector;
        u16 host_cs_selector;
        u16 host_ss_selector;
@@@ -412,10 -401,6 +412,10 @@@ struct nested_vmx 
         */
        struct page *apic_access_page;
        struct page *virtual_apic_page;
 +      struct page *pi_desc_page;
 +      struct pi_desc *pi_desc;
 +      bool pi_pending;
 +      u16 posted_intr_nv;
        u64 msr_ia32_feature_control;
  
        struct hrtimer preemption_timer;
  
        /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
        u64 vmcs01_debugctl;
 +
 +      u32 nested_vmx_procbased_ctls_low;
 +      u32 nested_vmx_procbased_ctls_high;
 +      u32 nested_vmx_true_procbased_ctls_low;
 +      u32 nested_vmx_secondary_ctls_low;
 +      u32 nested_vmx_secondary_ctls_high;
 +      u32 nested_vmx_pinbased_ctls_low;
 +      u32 nested_vmx_pinbased_ctls_high;
 +      u32 nested_vmx_exit_ctls_low;
 +      u32 nested_vmx_exit_ctls_high;
 +      u32 nested_vmx_true_exit_ctls_low;
 +      u32 nested_vmx_entry_ctls_low;
 +      u32 nested_vmx_entry_ctls_high;
 +      u32 nested_vmx_true_entry_ctls_low;
 +      u32 nested_vmx_misc_low;
 +      u32 nested_vmx_misc_high;
 +      u32 nested_vmx_ept_caps;
  };
  
  #define POSTED_INTR_ON  0
@@@ -543,10 -511,6 +543,10 @@@ struct vcpu_vmx 
        /* Dynamic PLE window. */
        int ple_window;
        bool ple_window_dirty;
 +
 +      /* Support for PML */
 +#define PML_ENTITY_NUM                512
 +      struct page *pml_pg;
  };
  
  enum segment_cache_field {
@@@ -630,7 -594,6 +630,7 @@@ static int max_shadow_read_write_field
  
  static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
 +      FIELD(POSTED_INTR_NV, posted_intr_nv),
        FIELD(GUEST_ES_SELECTOR, guest_es_selector),
        FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
        FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
        FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
        FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
        FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
 +      FIELD(GUEST_INTR_STATUS, guest_intr_status),
        FIELD(HOST_ES_SELECTOR, host_es_selector),
        FIELD(HOST_CS_SELECTOR, host_cs_selector),
        FIELD(HOST_SS_SELECTOR, host_ss_selector),
        FIELD64(TSC_OFFSET, tsc_offset),
        FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
        FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
 +      FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
        FIELD64(EPT_POINTER, ept_pointer),
 +      FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
 +      FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
 +      FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
 +      FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
        FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@@ -809,7 -766,6 +809,7 @@@ static void kvm_cpu_vmxon(u64 addr)
  static void kvm_cpu_vmxoff(void);
  static bool vmx_mpx_supported(void);
  static bool vmx_xsaves_supported(void);
 +static int vmx_vm_has_apicv(struct kvm *kvm);
  static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
  static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
@@@ -837,7 -793,6 +837,7 @@@ static unsigned long *vmx_msr_bitmap_le
  static unsigned long *vmx_msr_bitmap_longmode;
  static unsigned long *vmx_msr_bitmap_legacy_x2apic;
  static unsigned long *vmx_msr_bitmap_longmode_x2apic;
 +static unsigned long *vmx_msr_bitmap_nested;
  static unsigned long *vmx_vmread_bitmap;
  static unsigned long *vmx_vmwrite_bitmap;
  
@@@ -1004,6 -959,16 +1004,6 @@@ static inline bool cpu_has_vmx_ept_exec
        return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
  }
  
 -static inline bool cpu_has_vmx_eptp_uncacheable(void)
 -{
 -      return vmx_capability.ept & VMX_EPTP_UC_BIT;
 -}
 -
 -static inline bool cpu_has_vmx_eptp_writeback(void)
 -{
 -      return vmx_capability.ept & VMX_EPTP_WB_BIT;
 -}
 -
  static inline bool cpu_has_vmx_ept_2m_page(void)
  {
        return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
@@@ -1108,11 -1073,6 +1108,11 @@@ static inline bool cpu_has_vmx_shadow_v
                SECONDARY_EXEC_SHADOW_VMCS;
  }
  
 +static inline bool cpu_has_vmx_pml(void)
 +{
 +      return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
 +}
 +
  static inline bool report_flexpriority(void)
  {
        return flexpriority_enabled;
@@@ -1152,26 -1112,6 +1152,26 @@@ static inline bool nested_cpu_has_xsave
                vmx_xsaves_supported();
  }
  
 +static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
 +{
 +      return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 +}
 +
 +static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
 +{
 +      return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
 +}
 +
 +static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
 +{
 +      return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 +}
 +
 +static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
 +{
 +      return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
 +}
 +
  static inline bool is_exception(u32 intr_info)
  {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@@ -2344,8 -2284,20 +2344,8 @@@ static inline bool nested_vmx_allowed(s
   * if the corresponding bit in the (32-bit) control field *must* be on, and a
   * bit in the high half is on if the corresponding bit in the control field
   * may be on. See also vmx_control_verify().
 - * TODO: allow these variables to be modified (downgraded) by module options
 - * or other means.
   */
 -static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
 -static u32 nested_vmx_true_procbased_ctls_low;
 -static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
 -static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 -static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 -static u32 nested_vmx_true_exit_ctls_low;
 -static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 -static u32 nested_vmx_true_entry_ctls_low;
 -static u32 nested_vmx_misc_low, nested_vmx_misc_high;
 -static u32 nested_vmx_ept_caps;
 -static __init void nested_vmx_setup_ctls_msrs(void)
 +static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
  {
        /*
         * Note that as a general rule, the high half of the MSRs (bits in
  
        /* pin-based controls */
        rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
 -            nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
 -      nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 -      nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
 -              PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
 -      nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 +              vmx->nested.nested_vmx_pinbased_ctls_low,
 +              vmx->nested.nested_vmx_pinbased_ctls_high);
 +      vmx->nested.nested_vmx_pinbased_ctls_low |=
 +              PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 +      vmx->nested.nested_vmx_pinbased_ctls_high &=
 +              PIN_BASED_EXT_INTR_MASK |
 +              PIN_BASED_NMI_EXITING |
 +              PIN_BASED_VIRTUAL_NMIS;
 +      vmx->nested.nested_vmx_pinbased_ctls_high |=
 +              PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
                PIN_BASED_VMX_PREEMPTION_TIMER;
 +      if (vmx_vm_has_apicv(vmx->vcpu.kvm))
 +              vmx->nested.nested_vmx_pinbased_ctls_high |=
 +                      PIN_BASED_POSTED_INTR;
  
        /* exit controls */
        rdmsr(MSR_IA32_VMX_EXIT_CTLS,
 -              nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
 -      nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 +              vmx->nested.nested_vmx_exit_ctls_low,
 +              vmx->nested.nested_vmx_exit_ctls_high);
 +      vmx->nested.nested_vmx_exit_ctls_low =
 +              VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  
 -      nested_vmx_exit_ctls_high &=
 +      vmx->nested.nested_vmx_exit_ctls_high &=
  #ifdef CONFIG_X86_64
                VM_EXIT_HOST_ADDR_SPACE_SIZE |
  #endif
                VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
 -      nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 +      vmx->nested.nested_vmx_exit_ctls_high |=
 +              VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
                VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
  
        if (vmx_mpx_supported())
 -              nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 +              vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
  
        /* We support free control of debug control saving. */
 -      nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
 +      vmx->nested.nested_vmx_true_exit_ctls_low =
 +              vmx->nested.nested_vmx_exit_ctls_low &
                ~VM_EXIT_SAVE_DEBUG_CONTROLS;
  
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
 -              nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
 -      nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 -      nested_vmx_entry_ctls_high &=
 +              vmx->nested.nested_vmx_entry_ctls_low,
 +              vmx->nested.nested_vmx_entry_ctls_high);
 +      vmx->nested.nested_vmx_entry_ctls_low =
 +              VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 +      vmx->nested.nested_vmx_entry_ctls_high &=
  #ifdef CONFIG_X86_64
                VM_ENTRY_IA32E_MODE |
  #endif
                VM_ENTRY_LOAD_IA32_PAT;
 -      nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
 -                                     VM_ENTRY_LOAD_IA32_EFER);
 +      vmx->nested.nested_vmx_entry_ctls_high |=
 +              (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
        if (vmx_mpx_supported())
 -              nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 +              vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
  
        /* We support free control of debug control loading. */
 -      nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
 +      vmx->nested.nested_vmx_true_entry_ctls_low =
 +              vmx->nested.nested_vmx_entry_ctls_low &
                ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
  
        /* cpu-based controls */
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
 -              nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
 -      nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 -      nested_vmx_procbased_ctls_high &=
 +              vmx->nested.nested_vmx_procbased_ctls_low,
 +              vmx->nested.nested_vmx_procbased_ctls_high);
 +      vmx->nested.nested_vmx_procbased_ctls_low =
 +              CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 +      vmx->nested.nested_vmx_procbased_ctls_high &=
                CPU_BASED_VIRTUAL_INTR_PENDING |
                CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
                CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
         * can use it to avoid exits to L1 - even when L0 runs L2
         * without MSR bitmaps.
         */
 -      nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 +      vmx->nested.nested_vmx_procbased_ctls_high |=
 +              CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
                CPU_BASED_USE_MSR_BITMAPS;
  
        /* We support free control of CR3 access interception. */
 -      nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
 +      vmx->nested.nested_vmx_true_procbased_ctls_low =
 +              vmx->nested.nested_vmx_procbased_ctls_low &
                ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
  
        /* secondary cpu-based controls */
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
 -              nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
 -      nested_vmx_secondary_ctls_low = 0;
 -      nested_vmx_secondary_ctls_high &=
 +              vmx->nested.nested_vmx_secondary_ctls_low,
 +              vmx->nested.nested_vmx_secondary_ctls_high);
 +      vmx->nested.nested_vmx_secondary_ctls_low = 0;
 +      vmx->nested.nested_vmx_secondary_ctls_high &=
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 +              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 +              SECONDARY_EXEC_APIC_REGISTER_VIRT |
 +              SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_WBINVD_EXITING |
                SECONDARY_EXEC_XSAVES;
  
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
 -              nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
 +              vmx->nested.nested_vmx_secondary_ctls_high |=
 +                      SECONDARY_EXEC_ENABLE_EPT |
                        SECONDARY_EXEC_UNRESTRICTED_GUEST;
 -              nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
 +              vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
                         VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
                         VMX_EPT_INVEPT_BIT;
 -              nested_vmx_ept_caps &= vmx_capability.ept;
 +              vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
                /*
                 * For nested guests, we don't do anything specific
                 * for single context invalidation. Hence, only advertise
                 * support for global context invalidation.
                 */
 -              nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
 +              vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
        } else
 -              nested_vmx_ept_caps = 0;
 +              vmx->nested.nested_vmx_ept_caps = 0;
  
        /* miscellaneous data */
 -      rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
 -      nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
 -      nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
 +      rdmsr(MSR_IA32_VMX_MISC,
 +              vmx->nested.nested_vmx_misc_low,
 +              vmx->nested.nested_vmx_misc_high);
 +      vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
 +      vmx->nested.nested_vmx_misc_low |=
 +              VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
                VMX_MISC_ACTIVITY_HLT;
 -      nested_vmx_misc_high = 0;
 +      vmx->nested.nested_vmx_misc_high = 0;
  }
  
  static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@@ -2518,8 -2443,6 +2518,8 @@@ static inline u64 vmx_control_msr(u32 l
  /* Returns 0 on success, non-0 otherwise. */
  static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
  {
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +
        switch (msr_index) {
        case MSR_IA32_VMX_BASIC:
                /*
                break;
        case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
        case MSR_IA32_VMX_PINBASED_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
 -                                      nested_vmx_pinbased_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_pinbased_ctls_low,
 +                      vmx->nested.nested_vmx_pinbased_ctls_high);
                break;
        case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
 -                                      nested_vmx_procbased_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_true_procbased_ctls_low,
 +                      vmx->nested.nested_vmx_procbased_ctls_high);
                break;
        case MSR_IA32_VMX_PROCBASED_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
 -                                      nested_vmx_procbased_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_procbased_ctls_low,
 +                      vmx->nested.nested_vmx_procbased_ctls_high);
                break;
        case MSR_IA32_VMX_TRUE_EXIT_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
 -                                      nested_vmx_exit_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_true_exit_ctls_low,
 +                      vmx->nested.nested_vmx_exit_ctls_high);
                break;
        case MSR_IA32_VMX_EXIT_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
 -                                      nested_vmx_exit_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_exit_ctls_low,
 +                      vmx->nested.nested_vmx_exit_ctls_high);
                break;
        case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
 -                                      nested_vmx_entry_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_true_entry_ctls_low,
 +                      vmx->nested.nested_vmx_entry_ctls_high);
                break;
        case MSR_IA32_VMX_ENTRY_CTLS:
 -              *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
 -                                      nested_vmx_entry_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_entry_ctls_low,
 +                      vmx->nested.nested_vmx_entry_ctls_high);
                break;
        case MSR_IA32_VMX_MISC:
 -              *pdata = vmx_control_msr(nested_vmx_misc_low,
 -                                       nested_vmx_misc_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_misc_low,
 +                      vmx->nested.nested_vmx_misc_high);
                break;
        /*
         * These MSRs specify bits which the guest must keep fixed (on or off)
                *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
                break;
        case MSR_IA32_VMX_PROCBASED_CTLS2:
 -              *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
 -                                      nested_vmx_secondary_ctls_high);
 +              *pdata = vmx_control_msr(
 +                      vmx->nested.nested_vmx_secondary_ctls_low,
 +                      vmx->nested.nested_vmx_secondary_ctls_high);
                break;
        case MSR_IA32_VMX_EPT_VPID_CAP:
                /* Currently, no nested vpid support */
 -              *pdata = nested_vmx_ept_caps;
 +              *pdata = vmx->nested.nested_vmx_ept_caps;
                break;
        default:
                return 1;
@@@ -2871,7 -2785,7 +2871,7 @@@ static int hardware_enable(void
        u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
        u64 old, test_bits;
  
-       if (read_cr4() & X86_CR4_VMXE)
+       if (cr4_read_shadow() & X86_CR4_VMXE)
                return -EBUSY;
  
        INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
                /* enable and lock */
                wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
        }
-       write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+       cr4_set_bits(X86_CR4_VMXE);
  
        if (vmm_exclusive) {
                kvm_cpu_vmxon(phys_addr);
@@@ -2935,7 -2849,7 +2935,7 @@@ static void hardware_disable(void
                vmclear_local_loaded_vmcss();
                kvm_cpu_vmxoff();
        }
-       write_cr4(read_cr4() & ~X86_CR4_VMXE);
+       cr4_clear_bits(X86_CR4_VMXE);
  }
  
  static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@@ -3015,8 -2929,7 +3015,8 @@@ static __init int setup_vmcs_config(str
                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                        SECONDARY_EXEC_SHADOW_VMCS |
 -                      SECONDARY_EXEC_XSAVES;
 +                      SECONDARY_EXEC_XSAVES |
 +                      SECONDARY_EXEC_ENABLE_PML;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
@@@ -4246,52 -4159,6 +4246,52 @@@ static void __vmx_enable_intercept_for_
        }
  }
  
 +/*
 + * If a msr is allowed by L0, we should check whether it is allowed by L1.
 + * The corresponding bit will be cleared unless both of L0 and L1 allow it.
 + */
 +static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
 +                                             unsigned long *msr_bitmap_nested,
 +                                             u32 msr, int type)
 +{
 +      int f = sizeof(unsigned long);
 +
 +      if (!cpu_has_vmx_msr_bitmap()) {
 +              WARN_ON(1);
 +              return;
 +      }
 +
 +      /*
 +       * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
 +       * have the write-low and read-high bitmap offsets the wrong way round.
 +       * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 +       */
 +      if (msr <= 0x1fff) {
 +              if (type & MSR_TYPE_R &&
 +                 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
 +                      /* read-low */
 +                      __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
 +
 +              if (type & MSR_TYPE_W &&
 +                 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
 +                      /* write-low */
 +                      __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
 +
 +      } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 +              msr &= 0x1fff;
 +              if (type & MSR_TYPE_R &&
 +                 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
 +                      /* read-high */
 +                      __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
 +
 +              if (type & MSR_TYPE_W &&
 +                 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
 +                      /* write-high */
 +                      __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
 +
 +      }
 +}
 +
  static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
  {
        if (!longmode_only)
@@@ -4330,64 -4197,6 +4330,64 @@@ static int vmx_vm_has_apicv(struct kvm 
        return enable_apicv && irqchip_in_kernel(kvm);
  }
  
 +static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 +{
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +      int max_irr;
 +      void *vapic_page;
 +      u16 status;
 +
 +      if (vmx->nested.pi_desc &&
 +          vmx->nested.pi_pending) {
 +              vmx->nested.pi_pending = false;
 +              if (!pi_test_and_clear_on(vmx->nested.pi_desc))
 +                      return 0;
 +
 +              max_irr = find_last_bit(
 +                      (unsigned long *)vmx->nested.pi_desc->pir, 256);
 +
 +              if (max_irr == 256)
 +                      return 0;
 +
 +              vapic_page = kmap(vmx->nested.virtual_apic_page);
 +              if (!vapic_page) {
 +                      WARN_ON(1);
 +                      return -ENOMEM;
 +              }
 +              __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
 +              kunmap(vmx->nested.virtual_apic_page);
 +
 +              status = vmcs_read16(GUEST_INTR_STATUS);
 +              if ((u8)max_irr > ((u8)status & 0xff)) {
 +                      status &= ~0xff;
 +                      status |= (u8)max_irr;
 +                      vmcs_write16(GUEST_INTR_STATUS, status);
 +              }
 +      }
 +      return 0;
 +}
 +
 +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 +                                              int vector)
 +{
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +
 +      if (is_guest_mode(vcpu) &&
 +          vector == vmx->nested.posted_intr_nv) {
 +              /* the PIR and ON have been set by L1. */
 +              if (vcpu->mode == IN_GUEST_MODE)
 +                      apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
 +                              POSTED_INTR_VECTOR);
 +              /*
 +               * If a posted intr is not recognized by hardware,
 +               * we will accomplish it in the next vmentry.
 +               */
 +              vmx->nested.pi_pending = true;
 +              kvm_make_request(KVM_REQ_EVENT, vcpu);
 +              return 0;
 +      }
 +      return -1;
 +}
  /*
   * Send interrupt to vcpu via posted interrupt way.
   * 1. If target vcpu is running(non-root mode), send posted interrupt
@@@ -4400,10 -4209,6 +4400,10 @@@ static void vmx_deliver_posted_interrup
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int r;
  
 +      r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
 +      if (!r)
 +              return;
 +
        if (pi_test_and_set_pir(vector, &vmx->pi_desc))
                return;
  
@@@ -4450,7 -4255,7 +4450,7 @@@ static void vmx_set_constant_host_state
        vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
  
        /* Save the most likely value for this task's CR4 in the VMCS. */
-       cr4 = read_cr4();
+       cr4 = cr4_read_shadow();
        vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
        vmx->host_state.vmcs_host_cr4 = cr4;
  
@@@ -4555,9 -4360,6 +4555,9 @@@ static u32 vmx_secondary_exec_control(s
           a current VMCS12
        */
        exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
 +      /* PML is enabled/disabled in creating/destorying vcpu */
 +      exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 +
        return exec_control;
  }
  
@@@ -5184,12 -4986,11 +5184,12 @@@ vmx_patch_hypercall(struct kvm_vcpu *vc
        hypercall[2] = 0xc1;
  }
  
 -static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
 +static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
  {
        unsigned long always_on = VMXON_CR0_ALWAYSON;
 +      struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  
 -      if (nested_vmx_secondary_ctls_high &
 +      if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
                SECONDARY_EXEC_UNRESTRICTED_GUEST &&
            nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
                always_on &= ~(X86_CR0_PE | X86_CR0_PG);
@@@ -5214,7 -5015,7 +5214,7 @@@ static int handle_set_cr0(struct kvm_vc
                val = (val & ~vmcs12->cr0_guest_host_mask) |
                        (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
  
 -              if (!nested_cr0_valid(vmcs12, val))
 +              if (!nested_cr0_valid(vcpu, val))
                        return 1;
  
                if (kvm_set_cr0(vcpu, val))
@@@ -6016,21 -5817,13 +6016,21 @@@ static __init int hardware_setup(void
                                (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_longmode_x2apic)
                goto out4;
 +
 +      if (nested) {
 +              vmx_msr_bitmap_nested =
 +                      (unsigned long *)__get_free_page(GFP_KERNEL);
 +              if (!vmx_msr_bitmap_nested)
 +                      goto out5;
 +      }
 +
        vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmread_bitmap)
 -              goto out5;
 +              goto out6;
  
        vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_vmwrite_bitmap)
 -              goto out6;
 +              goto out7;
  
        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
  
        memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
        memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
 +      if (nested)
 +              memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
  
        if (setup_vmcs_config(&vmcs_config) < 0) {
                r = -EIO;
 -              goto out7;
 +              goto out8;
        }
  
        if (boot_cpu_has(X86_FEATURE_NX))
        if (!cpu_has_vmx_unrestricted_guest())
                enable_unrestricted_guest = 0;
  
 -      if (!cpu_has_vmx_flexpriority()) {
 +      if (!cpu_has_vmx_flexpriority())
                flexpriority_enabled = 0;
  
 -              /*
 -               * set_apic_access_page_addr() is used to reload apic access
 -               * page upon invalidation.  No need to do anything if the
 -               * processor does not have the APIC_ACCESS_ADDR VMCS field.
 -               */
 +      /*
 +       * set_apic_access_page_addr() is used to reload apic access
 +       * page upon invalidation.  No need to do anything if not
 +       * using the APIC_ACCESS_ADDR VMCS field.
 +       */
 +      if (!flexpriority_enabled)
                kvm_x86_ops->set_apic_access_page_addr = NULL;
 -      }
  
        if (!cpu_has_vmx_tpr_shadow())
                kvm_x86_ops->update_cr8_intercept = NULL;
                kvm_x86_ops->update_cr8_intercept = NULL;
        else {
                kvm_x86_ops->hwapic_irr_update = NULL;
 +              kvm_x86_ops->hwapic_isr_update = NULL;
                kvm_x86_ops->deliver_posted_interrupt = NULL;
                kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
        }
  
 -      if (nested)
 -              nested_vmx_setup_ctls_msrs();
 -
        vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
        vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
        vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
  
        update_ple_window_actual_max();
  
 +      /*
 +       * Only enable PML when hardware supports PML feature, and both EPT
 +       * and EPT A/D bit features are enabled -- PML depends on them to work.
 +       */
 +      if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
 +              enable_pml = 0;
 +
 +      if (!enable_pml) {
 +              kvm_x86_ops->slot_enable_log_dirty = NULL;
 +              kvm_x86_ops->slot_disable_log_dirty = NULL;
 +              kvm_x86_ops->flush_log_dirty = NULL;
 +              kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
 +      }
 +
        return alloc_kvm_area();
  
 -out7:
 +out8:
        free_page((unsigned long)vmx_vmwrite_bitmap);
 -out6:
 +out7:
        free_page((unsigned long)vmx_vmread_bitmap);
 +out6:
 +      if (nested)
 +              free_page((unsigned long)vmx_msr_bitmap_nested);
  out5:
        free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
  out4:
@@@ -6201,8 -5977,6 +6201,8 @@@ static __exit void hardware_unsetup(voi
        free_page((unsigned long)vmx_io_bitmap_a);
        free_page((unsigned long)vmx_vmwrite_bitmap);
        free_page((unsigned long)vmx_vmread_bitmap);
 +      if (nested)
 +              free_page((unsigned long)vmx_msr_bitmap_nested);
  
        free_kvm_area();
  }
@@@ -6369,13 -6143,6 +6369,13 @@@ static void nested_vmx_failValid(struc
         */
  }
  
 +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 +{
 +      /* TODO: not to reset guest simply here. */
 +      kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 +      pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
 +}
 +
  static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
  {
        struct vcpu_vmx *vmx =
@@@ -6665,7 -6432,6 +6665,7 @@@ static inline void nested_release_vmcs1
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
                vmcs_write64(VMCS_LINK_POINTER, -1ull);
        }
 +      vmx->nested.posted_intr_nv = -1;
        kunmap(vmx->nested.current_vmcs12_page);
        nested_release_page(vmx->nested.current_vmcs12_page);
        vmx->nested.current_vmptr = -1ull;
@@@ -6694,12 -6460,6 +6694,12 @@@ static void free_nested(struct vcpu_vm
                nested_release_page(vmx->nested.virtual_apic_page);
                vmx->nested.virtual_apic_page = NULL;
        }
 +      if (vmx->nested.pi_desc_page) {
 +              kunmap(vmx->nested.pi_desc_page);
 +              nested_release_page(vmx->nested.pi_desc_page);
 +              vmx->nested.pi_desc_page = NULL;
 +              vmx->nested.pi_desc = NULL;
 +      }
  
        nested_free_all_saved_vmcss(vmx);
  }
@@@ -7133,7 -6893,6 +7133,7 @@@ static int handle_vmptrst(struct kvm_vc
  /* Emulate the INVEPT instruction */
  static int handle_invept(struct kvm_vcpu *vcpu)
  {
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vmx_instruction_info, types;
        unsigned long type;
        gva_t gva;
                u64 eptp, gpa;
        } operand;
  
 -      if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
 -          !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
 +      if (!(vmx->nested.nested_vmx_secondary_ctls_high &
 +            SECONDARY_EXEC_ENABLE_EPT) ||
 +          !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
        vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
        type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
  
 -      types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
 +      types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
  
        if (!(types & (1UL << type))) {
                nested_vmx_failValid(vcpu,
@@@ -7202,31 -6960,6 +7202,31 @@@ static int handle_invvpid(struct kvm_vc
        return 1;
  }
  
 +static int handle_pml_full(struct kvm_vcpu *vcpu)
 +{
 +      unsigned long exit_qualification;
 +
 +      trace_kvm_pml_full(vcpu->vcpu_id);
 +
 +      exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 +
 +      /*
 +       * PML buffer FULL happened while executing iret from NMI,
 +       * "blocked by NMI" bit has to be set before next VM entry.
 +       */
 +      if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
 +                      cpu_has_virtual_nmis() &&
 +                      (exit_qualification & INTR_INFO_UNBLOCK_NMI))
 +              vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 +                              GUEST_INTR_STATE_NMI);
 +
 +      /*
 +       * PML buffer already flushed at beginning of VMEXIT. Nothing to do
 +       * here.., and there's no userspace involvement needed for PML.
 +       */
 +      return 1;
 +}
 +
  /*
   * The exit handlers return 1 if the exit was handled fully and guest execution
   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@@ -7275,7 -7008,6 +7275,7 @@@ static int (*const kvm_vmx_exit_handler
        [EXIT_REASON_INVVPID]                 = handle_invvpid,
        [EXIT_REASON_XSAVES]                  = handle_xsaves,
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
 +      [EXIT_REASON_PML_FULL]                = handle_pml_full,
  };
  
  static const int kvm_vmx_max_exit_handlers =
@@@ -7543,10 -7275,6 +7543,10 @@@ static bool nested_vmx_exit_handled(str
        case EXIT_REASON_APIC_ACCESS:
                return nested_cpu_has2(vmcs12,
                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
 +      case EXIT_REASON_APIC_WRITE:
 +      case EXIT_REASON_EOI_INDUCED:
 +              /* apic_write and eoi_induced should exit unconditionally. */
 +              return 1;
        case EXIT_REASON_EPT_VIOLATION:
                /*
                 * L0 always deals with the EPT violation. If nested EPT is
@@@ -7586,89 -7314,6 +7586,89 @@@ static void vmx_get_exit_info(struct kv
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
  }
  
 +static int vmx_enable_pml(struct vcpu_vmx *vmx)
 +{
 +      struct page *pml_pg;
 +      u32 exec_control;
 +
 +      pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
 +      if (!pml_pg)
 +              return -ENOMEM;
 +
 +      vmx->pml_pg = pml_pg;
 +
 +      vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 +      vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 +
 +      exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 +      exec_control |= SECONDARY_EXEC_ENABLE_PML;
 +      vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 +
 +      return 0;
 +}
 +
 +static void vmx_disable_pml(struct vcpu_vmx *vmx)
 +{
 +      u32 exec_control;
 +
 +      ASSERT(vmx->pml_pg);
 +      __free_page(vmx->pml_pg);
 +      vmx->pml_pg = NULL;
 +
 +      exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 +      exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 +      vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 +}
 +
 +static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
 +{
 +      struct kvm *kvm = vmx->vcpu.kvm;
 +      u64 *pml_buf;
 +      u16 pml_idx;
 +
 +      pml_idx = vmcs_read16(GUEST_PML_INDEX);
 +
 +      /* Do nothing if PML buffer is empty */
 +      if (pml_idx == (PML_ENTITY_NUM - 1))
 +              return;
 +
 +      /* PML index always points to next available PML buffer entity */
 +      if (pml_idx >= PML_ENTITY_NUM)
 +              pml_idx = 0;
 +      else
 +              pml_idx++;
 +
 +      pml_buf = page_address(vmx->pml_pg);
 +      for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
 +              u64 gpa;
 +
 +              gpa = pml_buf[pml_idx];
 +              WARN_ON(gpa & (PAGE_SIZE - 1));
 +              mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
 +      }
 +
 +      /* reset PML index */
 +      vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 +}
 +
 +/*
 + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
 + * Called before reporting dirty_bitmap to userspace.
 + */
 +static void kvm_flush_pml_buffers(struct kvm *kvm)
 +{
 +      int i;
 +      struct kvm_vcpu *vcpu;
 +      /*
 +       * We only need to kick vcpu out of guest mode here, as PML buffer
 +       * is flushed at beginning of all VMEXITs, and it's obvious that only
 +       * vcpus running in guest are possible to have unflushed GPAs in PML
 +       * buffer.
 +       */
 +      kvm_for_each_vcpu(i, vcpu, kvm)
 +              kvm_vcpu_kick(vcpu);
 +}
 +
  /*
   * The guest has exited.  See if we can fix it or if we need userspace
   * assistance.
@@@ -7679,16 -7324,6 +7679,16 @@@ static int vmx_handle_exit(struct kvm_v
        u32 exit_reason = vmx->exit_reason;
        u32 vectoring_info = vmx->idt_vectoring_info;
  
 +      /*
 +       * Flush logged GPAs PML buffer, this will make dirty_bitmap more
 +       * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
 +       * querying dirty_bitmap, we only need to kick all vcpus out of guest
 +       * mode as if vcpus is in root mode, the PML buffer must has been
 +       * flushed already.
 +       */
 +      if (enable_pml)
 +              vmx_flush_pml_buffer(vmx);
 +
        /* If guest state is invalid, start emulating */
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
@@@ -7836,6 -7471,9 +7836,6 @@@ static void vmx_hwapic_isr_update(struc
        u16 status;
        u8 old;
  
 -      if (!vmx_vm_has_apicv(kvm))
 -              return;
 -
        if (isr == -1)
                isr = 0;
  
@@@ -8146,7 -7784,7 +8146,7 @@@ static void __noclone vmx_vcpu_run(stru
        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
  
-       cr4 = read_cr4();
+       cr4 = cr4_read_shadow();
        if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
                vmcs_writel(HOST_CR4, cr4);
                vmx->host_state.vmcs_host_cr4 = cr4;
@@@ -8335,8 -7973,6 +8335,8 @@@ static void vmx_free_vcpu(struct kvm_vc
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
 +      if (enable_pml)
 +              vmx_disable_pml(vmx);
        free_vpid(vmx);
        leave_guest_mode(vcpu);
        vmx_load_vmcs01(vcpu);
@@@ -8404,25 -8040,9 +8404,25 @@@ static struct kvm_vcpu *vmx_create_vcpu
                        goto free_vmcs;
        }
  
 +      if (nested)
 +              nested_vmx_setup_ctls_msrs(vmx);
 +
 +      vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
        vmx->nested.current_vmcs12 = NULL;
  
 +      /*
 +       * If PML is turned on, a failure to enable PML simply fails vcpu
 +       * creation. This keeps the PML logic simple (we never have to deal
 +       * with cases such as PML being enabled on only some of the guest's
 +       * vcpus).
 +       */
 +      if (enable_pml) {
 +              err = vmx_enable_pml(vmx);
 +              if (err)
 +                      goto free_vmcs;
 +      }
 +
        return &vmx->vcpu;
  
  free_vmcs:
@@@ -8564,10 -8184,9 +8564,10 @@@ static unsigned long nested_ept_get_cr3
  
  static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
  {
 -      kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
 -                      nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
 -
 +      WARN_ON(mmu_is_nested(vcpu));
 +      kvm_init_shadow_ept_mmu(vcpu,
 +                      to_vmx(vcpu)->nested.nested_vmx_ept_caps &
 +                      VMX_EPT_EXECUTE_ONLY_BIT);
        vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
        vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
        vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@@ -8580,18 -8199,6 +8580,18 @@@ static void nested_ept_uninit_mmu_conte
        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
  }
  
 +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 +                                          u16 error_code)
 +{
 +      bool inequality, bit;
 +
 +      bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
 +      inequality =
 +              (error_code & vmcs12->page_fault_error_code_mask) !=
 +               vmcs12->page_fault_error_code_match;
 +      return inequality ^ bit;
 +}
 +
  static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
                struct x86_exception *fault)
  {
  
        WARN_ON(!is_guest_mode(vcpu));
  
 -      /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
 -      if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
 +      if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
                nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
                                  vmcs_read32(VM_EXIT_INTR_INFO),
                                  vmcs_readl(EXIT_QUALIFICATION));
@@@ -8653,31 -8261,6 +8653,31 @@@ static bool nested_get_vmcs12_pages(str
                        return false;
        }
  
 +      if (nested_cpu_has_posted_intr(vmcs12)) {
 +              if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
 +                      return false;
 +
 +              if (vmx->nested.pi_desc_page) { /* shouldn't happen */
 +                      kunmap(vmx->nested.pi_desc_page);
 +                      nested_release_page(vmx->nested.pi_desc_page);
 +              }
 +              vmx->nested.pi_desc_page =
 +                      nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
 +              if (!vmx->nested.pi_desc_page)
 +                      return false;
 +
 +              vmx->nested.pi_desc =
 +                      (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
 +              if (!vmx->nested.pi_desc) {
 +                      nested_release_page_clean(vmx->nested.pi_desc_page);
 +                      return false;
 +              }
 +              vmx->nested.pi_desc =
 +                      (struct pi_desc *)((void *)vmx->nested.pi_desc +
 +                      (unsigned long)(vmcs12->posted_intr_desc_addr &
 +                      (PAGE_SIZE - 1)));
 +      }
 +
        return true;
  }
  
@@@ -8703,310 -8286,6 +8703,310 @@@ static void vmx_start_preemption_timer(
                      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
  }
  
 +static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
 +                                              struct vmcs12 *vmcs12)
 +{
 +      int maxphyaddr;
 +      u64 addr;
 +
 +      if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
 +              return 0;
 +
 +      if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
 +              WARN_ON(1);
 +              return -EINVAL;
 +      }
 +      maxphyaddr = cpuid_maxphyaddr(vcpu);
 +
 +      if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
 +         ((addr + PAGE_SIZE) >> maxphyaddr))
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +
 +/*
 + * Merge L0's and L1's MSR bitmaps; return false to indicate that we do
 + * not use the hardware MSR bitmap.
 + */
 +static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 +                                             struct vmcs12 *vmcs12)
 +{
 +      int msr;
 +      struct page *page;
 +      unsigned long *msr_bitmap;
 +
 +      if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
 +              return false;
 +
 +      page = nested_get_page(vcpu, vmcs12->msr_bitmap);
 +      if (!page) {
 +              WARN_ON(1);
 +              return false;
 +      }
 +      msr_bitmap = (unsigned long *)kmap(page);
 +      if (!msr_bitmap) {
 +              nested_release_page_clean(page);
 +              WARN_ON(1);
 +              return false;
 +      }
 +
 +      if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
 +              if (nested_cpu_has_apic_reg_virt(vmcs12))
 +                      for (msr = 0x800; msr <= 0x8ff; msr++)
 +                              nested_vmx_disable_intercept_for_msr(
 +                                      msr_bitmap,
 +                                      vmx_msr_bitmap_nested,
 +                                      msr, MSR_TYPE_R);
 +              /* TPR is allowed */
 +              nested_vmx_disable_intercept_for_msr(msr_bitmap,
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_TASKPRI >> 4),
 +                              MSR_TYPE_R | MSR_TYPE_W);
 +              if (nested_cpu_has_vid(vmcs12)) {
 +                      /* EOI and self-IPI are allowed */
 +                      nested_vmx_disable_intercept_for_msr(
 +                              msr_bitmap,
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_EOI >> 4),
 +                              MSR_TYPE_W);
 +                      nested_vmx_disable_intercept_for_msr(
 +                              msr_bitmap,
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
 +                              MSR_TYPE_W);
 +              }
 +      } else {
 +              /*
 +               * Enable read intercepts for all the x2apic MSRs. We
 +               * should not rely on vmcs12 to do any optimizations
 +               * here, since it may have been modified by L1.
 +               */
 +              for (msr = 0x800; msr <= 0x8ff; msr++)
 +                      __vmx_enable_intercept_for_msr(
 +                              vmx_msr_bitmap_nested,
 +                              msr,
 +                              MSR_TYPE_R);
 +
 +              __vmx_enable_intercept_for_msr(
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_TASKPRI >> 4),
 +                              MSR_TYPE_W);
 +              __vmx_enable_intercept_for_msr(
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_EOI >> 4),
 +                              MSR_TYPE_W);
 +              __vmx_enable_intercept_for_msr(
 +                              vmx_msr_bitmap_nested,
 +                              APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
 +                              MSR_TYPE_W);
 +      }
 +      kunmap(page);
 +      nested_release_page_clean(page);
 +
 +      return true;
 +}
 +
 +static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 +                                         struct vmcs12 *vmcs12)
 +{
 +      if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 +          !nested_cpu_has_apic_reg_virt(vmcs12) &&
 +          !nested_cpu_has_vid(vmcs12) &&
 +          !nested_cpu_has_posted_intr(vmcs12))
 +              return 0;
 +
 +      /*
 +       * If virtualize x2apic mode is enabled,
 +       * virtualize apic access must be disabled.
 +       */
 +      if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 +          nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 +              return -EINVAL;
 +
 +      /*
 +       * If virtual interrupt delivery is enabled,
 +       * we must exit on external interrupts.
 +       */
 +      if (nested_cpu_has_vid(vmcs12) &&
 +         !nested_exit_on_intr(vcpu))
 +              return -EINVAL;
 +
 +      /*
 +       * Bits 15:8 must be zero in posted_intr_nv; the descriptor
 +       * address has already been checked in nested_get_vmcs12_pages.
 +       */
 +      if (nested_cpu_has_posted_intr(vmcs12) &&
 +         (!nested_cpu_has_vid(vmcs12) ||
 +          !nested_exit_intr_ack_set(vcpu) ||
 +          vmcs12->posted_intr_nv & 0xff00))
 +              return -EINVAL;
 +
 +      /* TPR shadow is needed by all APICv features. */
 +      if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +
 +static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 +                                     unsigned long count_field,
 +                                     unsigned long addr_field,
 +                                     int maxphyaddr)
 +{
 +      u64 count, addr;
 +
 +      if (vmcs12_read_any(vcpu, count_field, &count) ||
 +          vmcs12_read_any(vcpu, addr_field, &addr)) {
 +              WARN_ON(1);
 +              return -EINVAL;
 +      }
 +      if (count == 0)
 +              return 0;
 +      if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
 +          (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
 +              pr_warn_ratelimited(
 +                      "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
 +                      addr_field, maxphyaddr, count, addr);
 +              return -EINVAL;
 +      }
 +      return 0;
 +}
 +
 +static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
 +                                              struct vmcs12 *vmcs12)
 +{
 +      int maxphyaddr;
 +
 +      if (vmcs12->vm_exit_msr_load_count == 0 &&
 +          vmcs12->vm_exit_msr_store_count == 0 &&
 +          vmcs12->vm_entry_msr_load_count == 0)
 +              return 0; /* Fast path */
 +      maxphyaddr = cpuid_maxphyaddr(vcpu);
 +      if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
 +                                      VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
 +          nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
 +                                      VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
 +          nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
 +                                      VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
 +              return -EINVAL;
 +      return 0;
 +}
 +
 +static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
 +                                     struct vmx_msr_entry *e)
 +{
 +      /* x2APIC MSR accesses are not allowed */
 +      if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
 +              return -EINVAL;
 +      if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
 +          e->index == MSR_IA32_UCODE_REV)
 +              return -EINVAL;
 +      if (e->reserved != 0)
 +              return -EINVAL;
 +      return 0;
 +}
 +
 +static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
 +                                   struct vmx_msr_entry *e)
 +{
 +      if (e->index == MSR_FS_BASE ||
 +          e->index == MSR_GS_BASE ||
 +          e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
 +          nested_vmx_msr_check_common(vcpu, e))
 +              return -EINVAL;
 +      return 0;
 +}
 +
 +static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
 +                                    struct vmx_msr_entry *e)
 +{
 +      if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
 +          nested_vmx_msr_check_common(vcpu, e))
 +              return -EINVAL;
 +      return 0;
 +}
 +
 +/*
 + * Load the guest's/host's MSRs at nested entry/exit.
 + * Return 0 on success, or the 1-based index of the failing entry on failure.
 + */
 +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 +{
 +      u32 i;
 +      struct vmx_msr_entry e;
 +      struct msr_data msr;
 +
 +      msr.host_initiated = false;
 +      for (i = 0; i < count; i++) {
 +              if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
 +                                 &e, sizeof(e))) {
 +                      pr_warn_ratelimited(
 +                              "%s cannot read MSR entry (%u, 0x%08llx)\n",
 +                              __func__, i, gpa + i * sizeof(e));
 +                      goto fail;
 +              }
 +              if (nested_vmx_load_msr_check(vcpu, &e)) {
 +                      pr_warn_ratelimited(
 +                              "%s check failed (%u, 0x%x, 0x%x)\n",
 +                              __func__, i, e.index, e.reserved);
 +                      goto fail;
 +              }
 +              msr.index = e.index;
 +              msr.data = e.value;
 +              if (kvm_set_msr(vcpu, &msr)) {
 +                      pr_warn_ratelimited(
 +                              "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 +                              __func__, i, e.index, e.value);
 +                      goto fail;
 +              }
 +      }
 +      return 0;
 +fail:
 +      return i + 1;
 +}
 +
 +static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 +{
 +      u32 i;
 +      struct vmx_msr_entry e;
 +
 +      for (i = 0; i < count; i++) {
 +              if (kvm_read_guest(vcpu->kvm,
 +                                 gpa + i * sizeof(e),
 +                                 &e, 2 * sizeof(u32))) {
 +                      pr_warn_ratelimited(
 +                              "%s cannot read MSR entry (%u, 0x%08llx)\n",
 +                              __func__, i, gpa + i * sizeof(e));
 +                      return -EINVAL;
 +              }
 +              if (nested_vmx_store_msr_check(vcpu, &e)) {
 +                      pr_warn_ratelimited(
 +                              "%s check failed (%u, 0x%x, 0x%x)\n",
 +                              __func__, i, e.index, e.reserved);
 +                      return -EINVAL;
 +              }
 +              if (kvm_get_msr(vcpu, e.index, &e.value)) {
 +                      pr_warn_ratelimited(
 +                              "%s cannot read MSR (%u, 0x%x)\n",
 +                              __func__, i, e.index);
 +                      return -EINVAL;
 +              }
 +              if (kvm_write_guest(vcpu->kvm,
 +                                  gpa + i * sizeof(e) +
 +                                      offsetof(struct vmx_msr_entry, value),
 +                                  &e.value, sizeof(e.value))) {
 +                      pr_warn_ratelimited(
 +                              "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 +                              __func__, i, e.index, e.value);
 +                      return -EINVAL;
 +              }
 +      }
 +      return 0;
 +}
 +
  /*
   * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
   * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@@ -9086,23 -8365,8 +9086,23 @@@ static void prepare_vmcs02(struct kvm_v
  
        exec_control = vmcs12->pin_based_vm_exec_control;
        exec_control |= vmcs_config.pin_based_exec_ctrl;
 -      exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER |
 -                          PIN_BASED_POSTED_INTR);
 +      exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 +
 +      if (nested_cpu_has_posted_intr(vmcs12)) {
 +              /*
 +               * Note that we use L0's vector here and in
 +               * vmx_deliver_nested_posted_interrupt.
 +               */
 +              vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
 +              vmx->nested.pi_pending = false;
 +              vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
 +              vmcs_write64(POSTED_INTR_DESC_ADDR,
 +                      page_to_phys(vmx->nested.pi_desc_page) +
 +                      (unsigned long)(vmcs12->posted_intr_desc_addr &
 +                      (PAGE_SIZE - 1)));
 +      } else
 +              exec_control &= ~PIN_BASED_POSTED_INTR;
 +
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
  
        vmx->nested.preemption_timer_expired = false;
                        else
                                vmcs_write64(APIC_ACCESS_ADDR,
                                  page_to_phys(vmx->nested.apic_access_page));
 -              } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
 +              } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
 +                          (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
                        exec_control |=
                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
                        kvm_vcpu_reload_apic_access_page(vcpu);
                }
  
 +              if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
 +                      vmcs_write64(EOI_EXIT_BITMAP0,
 +                              vmcs12->eoi_exit_bitmap0);
 +                      vmcs_write64(EOI_EXIT_BITMAP1,
 +                              vmcs12->eoi_exit_bitmap1);
 +                      vmcs_write64(EOI_EXIT_BITMAP2,
 +                              vmcs12->eoi_exit_bitmap2);
 +                      vmcs_write64(EOI_EXIT_BITMAP3,
 +                              vmcs12->eoi_exit_bitmap3);
 +                      vmcs_write16(GUEST_INTR_STATUS,
 +                              vmcs12->guest_intr_status);
 +              }
 +
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
        }
  
                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
        }
  
 +      if (cpu_has_vmx_msr_bitmap() &&
 +          exec_control & CPU_BASED_USE_MSR_BITMAPS &&
 +          nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
 +              vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
 +      } else
 +              exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
 +
        /*
 -       * Merging of IO and MSR bitmaps not currently supported.
 +       * Merging of IO bitmap not currently supported.
         * Rather, exit every time.
         */
 -      exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
        exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
        exec_control |= CPU_BASED_UNCOND_IO_EXITING;
  
@@@ -9338,7 -8582,6 +9338,7 @@@ static int nested_vmx_run(struct kvm_vc
        int cpu;
        struct loaded_vmcs *vmcs02;
        bool ia32e;
 +      u32 msr_entry_idx;
  
        if (!nested_vmx_check_permission(vcpu) ||
            !nested_vmx_check_vmcs12(vcpu))
                return 1;
        }
  
 -      if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
 -                      !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
 +      if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
                /*TODO: Also verify bits beyond physical address width are 0*/
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
  
 -      if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
 -              /*TODO: Also verify bits beyond physical address width are 0*/
 +      if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
  
 -      if (vmcs12->vm_entry_msr_load_count > 0 ||
 -          vmcs12->vm_exit_msr_load_count > 0 ||
 -          vmcs12->vm_exit_msr_store_count > 0) {
 -              pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
 -                                  __func__);
 +      if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
 +              nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 +              return 1;
 +      }
 +
 +      if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
  
        if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
 -                              nested_vmx_true_procbased_ctls_low,
 -                              nested_vmx_procbased_ctls_high) ||
 +                              vmx->nested.nested_vmx_true_procbased_ctls_low,
 +                              vmx->nested.nested_vmx_procbased_ctls_high) ||
            !vmx_control_verify(vmcs12->secondary_vm_exec_control,
 -            nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
 +                              vmx->nested.nested_vmx_secondary_ctls_low,
 +                              vmx->nested.nested_vmx_secondary_ctls_high) ||
            !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
 -            nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
 +                              vmx->nested.nested_vmx_pinbased_ctls_low,
 +                              vmx->nested.nested_vmx_pinbased_ctls_high) ||
            !vmx_control_verify(vmcs12->vm_exit_controls,
 -                              nested_vmx_true_exit_ctls_low,
 -                              nested_vmx_exit_ctls_high) ||
 +                              vmx->nested.nested_vmx_true_exit_ctls_low,
 +                              vmx->nested.nested_vmx_exit_ctls_high) ||
            !vmx_control_verify(vmcs12->vm_entry_controls,
 -                              nested_vmx_true_entry_ctls_low,
 -                              nested_vmx_entry_ctls_high))
 +                              vmx->nested.nested_vmx_true_entry_ctls_low,
 +                              vmx->nested.nested_vmx_entry_ctls_high))
        {
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
                return 1;
        }
  
 -      if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||
 +      if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
            ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
                nested_vmx_entry_failure(vcpu, vmcs12,
                        EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
  
        vmx_segment_cache_clear(vmx);
  
 -      vmcs12->launch_state = 1;
 -
        prepare_vmcs02(vcpu, vmcs12);
  
 +      msr_entry_idx = nested_vmx_load_msr(vcpu,
 +                                          vmcs12->vm_entry_msr_load_addr,
 +                                          vmcs12->vm_entry_msr_load_count);
 +      if (msr_entry_idx) {
 +              leave_guest_mode(vcpu);
 +              vmx_load_vmcs01(vcpu);
 +              nested_vmx_entry_failure(vcpu, vmcs12,
 +                              EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
 +              return 1;
 +      }
 +
 +      vmcs12->launch_state = 1;
 +
        if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
                return kvm_emulate_halt(vcpu);
  
@@@ -9638,10 -8869,9 +9638,10 @@@ static int vmx_check_nested_events(stru
                if (vmx->nested.nested_run_pending)
                        return -EBUSY;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
 +              return 0;
        }
  
 -      return 0;
 +      return vmx_complete_nested_posted_interrupt(vcpu);
  }
  
  static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@@ -9751,9 -8981,6 +9751,9 @@@ static void prepare_vmcs12(struct kvm_v
                vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
        }
  
 +      if (nested_cpu_has_vid(vmcs12))
 +              vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
 +
        vmcs12->vm_entry_controls =
                (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
                (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
@@@ -9945,13 -9172,6 +9945,13 @@@ static void load_vmcs12_host_state(stru
  
        kvm_set_dr(vcpu, 7, 0x400);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 +
 +      if (cpu_has_vmx_msr_bitmap())
 +              vmx_set_msr_bitmap(vcpu);
 +
 +      if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
 +                              vmcs12->vm_exit_msr_load_count))
 +              nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
  }
  
  /*
@@@ -9973,10 -9193,6 +9973,10 @@@ static void nested_vmx_vmexit(struct kv
        prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
                       exit_qualification);
  
 +      if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
 +                               vmcs12->vm_exit_msr_store_count))
 +              nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
 +
        vmx_load_vmcs01(vcpu);
  
        if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
                nested_release_page(vmx->nested.virtual_apic_page);
                vmx->nested.virtual_apic_page = NULL;
        }
 +      if (vmx->nested.pi_desc_page) {
 +              kunmap(vmx->nested.pi_desc_page);
 +              nested_release_page(vmx->nested.pi_desc_page);
 +              vmx->nested.pi_desc_page = NULL;
 +              vmx->nested.pi_desc = NULL;
 +      }
  
        /*
         * We are now running in L2, mmu_notifier will force to reload the
@@@ -10091,31 -9301,6 +10091,31 @@@ static void vmx_sched_in(struct kvm_vcp
                shrink_ple_window(vcpu);
  }
  
 +static void vmx_slot_enable_log_dirty(struct kvm *kvm,
 +                                   struct kvm_memory_slot *slot)
 +{
 +      kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
 +      kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
 +}
 +
 +static void vmx_slot_disable_log_dirty(struct kvm *kvm,
 +                                     struct kvm_memory_slot *slot)
 +{
 +      kvm_mmu_slot_set_dirty(kvm, slot);
 +}
 +
 +static void vmx_flush_log_dirty(struct kvm *kvm)
 +{
 +      kvm_flush_pml_buffers(kvm);
 +}
 +
 +static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
 +                                         struct kvm_memory_slot *memslot,
 +                                         gfn_t offset, unsigned long mask)
 +{
 +      kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 +}
 +
  static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .check_nested_events = vmx_check_nested_events,
  
        .sched_in = vmx_sched_in,
 +
 +      .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
 +      .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
 +      .flush_log_dirty = vmx_flush_log_dirty,
 +      .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
  };
  
  static int __init vmx_init(void)
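
The dirty-logging hooks added to kvm_x86_ops above are exercised from KVM's dirty-log ioctl path, which (per the comment above kvm_flush_pml_buffers) flushes every vcpu's PML buffer before dirty_bitmap is reported to userspace. As a rough usage sketch from the VMM side (hypothetical helper, not part of this patch; assumes the memslot was created with KVM_MEM_LOG_DIRTY_PAGES and that the caller knows its page count):

    #include <linux/kvm.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>

    /* Hypothetical VMM helper: fetch the dirty bitmap for one memslot.
     * With PML enabled the kernel drains all vcpu PML buffers before
     * copying the bitmap out, so no extra synchronization is needed here. */
    static void *get_dirty_bitmap(int vm_fd, unsigned int slot,
                                  unsigned long memslot_npages)
    {
            struct kvm_dirty_log log;
            /* On 64-bit hosts KVM copies the bitmap rounded up to a
             * multiple of 64 pages. */
            size_t bitmap_bytes = ((memslot_npages + 63) / 64) * 8;
            void *bitmap = calloc(1, bitmap_bytes);

            if (!bitmap)
                    return NULL;

            memset(&log, 0, sizeof(log));
            log.slot = slot;
            log.dirty_bitmap = bitmap;

            if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
                    free(bitmap);
                    return NULL;
            }
            return bitmap;  /* one bit per guest page, set if the page was dirtied */
    }
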
diff --combined arch/x86/mm/init.c
index 649da47d382706b1001a403daac284b833fc7202,a74aa0fd185332a110eb2d7b04607c0fabd4dc30..553c094b9cd7984b7334a95122931a93249f1ddf
@@@ -173,11 -173,11 +173,11 @@@ static void __init probe_page_size_mask
  
        /* Enable PSE if available */
        if (cpu_has_pse)
-               set_in_cr4(X86_CR4_PSE);
+               cr4_set_bits_and_update_boot(X86_CR4_PSE);
  
        /* Enable PGE if available */
        if (cpu_has_pge) {
-               set_in_cr4(X86_CR4_PGE);
+               cr4_set_bits_and_update_boot(X86_CR4_PGE);
                __supported_pte_mask |= _PAGE_GLOBAL;
        }
  }
@@@ -608,7 -608,7 +608,7 @@@ void __init init_mem_mapping(void
   *
   *
   * On x86, access has to be given to the first megabyte of ram because that area
 - * contains bios code and data regions used by X and dosemu and similar apps.
 + * contains BIOS code and data regions used by X and dosemu and similar apps.
   * Access has to be given to non-kernel-ram areas as well, these contain the PCI
   * mmio resources as well as potential bios/acpi data regions.
   */
@@@ -713,6 -713,15 +713,15 @@@ void __init zone_sizes_init(void
        free_area_init_nodes(max_zone_pfns);
  }
  
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+ #ifdef CONFIG_SMP
+       .active_mm = &init_mm,
+       .state = 0,
+ #endif
+       .cr4 = ~0UL,    /* fail hard if we screw up cr4 shadow initialization */
+ };
+ EXPORT_SYMBOL_GPL(cpu_tlbstate);
  void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
  {
        /* entry 0 MUST be WB (hardwired to speed up translations) */
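
For reference, the cr4_read_shadow()/cr4_set_bits() helpers used throughout this diff are built on the cpu_tlbstate.cr4 field initialized above (the ~0UL initializer deliberately poisons the shadow, as the comment says, until boot code seeds it with the real register value). A simplified sketch of what such helpers look like, assuming the usual this_cpu_read/this_cpu_write percpu accessors — not the verbatim kernel code, which lives in arch/x86/include/asm/tlbflush.h:

    /* Simplified sketch of the CR4 shadow helpers. */
    static inline unsigned long cr4_read_shadow(void)
    {
            /* Cheap: reads the per-cpu shadow instead of the real CR4. */
            return this_cpu_read(cpu_tlbstate.cr4);
    }

    static inline void cr4_set_bits(unsigned long mask)
    {
            unsigned long cr4 = this_cpu_read(cpu_tlbstate.cr4);

            if ((cr4 | mask) != cr4) {      /* skip the write if nothing changes */
                    cr4 |= mask;
                    this_cpu_write(cpu_tlbstate.cr4, cr4);
                    write_cr4(cr4);         /* keep the hardware register in sync */
            }
    }
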
index 2cdc9d422bed9245bef3e0e62a007efad80be6c1,33262004c31041c69b0706eb0b07bf37931b83b8..2b621982938d4653436b3d5cfe7578c1c71e6bb9
@@@ -202,6 -202,13 +202,13 @@@ struct pmu 
         */
        int (*event_init)               (struct perf_event *event);
  
+       /*
+        * Notification that the event was mapped or unmapped.  Called
+        * in the context of the mapping task.
+        */
+       void (*event_mapped)            (struct perf_event *event); /*optional*/
+       void (*event_unmapped)          (struct perf_event *event); /*optional*/
  #define PERF_EF_START 0x01            /* start the counter when adding    */
  #define PERF_EF_RELOAD        0x02            /* reload the counter when starting */
  #define PERF_EF_UPDATE        0x04            /* update the counter when stopping */
@@@ -907,22 -914,12 +914,22 @@@ struct perf_pmu_events_attr 
        const char *event_str;
  };
  
 +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 +                            char *page);
 +
  #define PMU_EVENT_ATTR(_name, _var, _id, _show)                               \
  static struct perf_pmu_events_attr _var = {                           \
        .attr = __ATTR(_name, 0444, _show, NULL),                       \
        .id   =  _id,                                                   \
  };
  
 +#define PMU_EVENT_ATTR_STRING(_name, _var, _str)                          \
 +static struct perf_pmu_events_attr _var = {                               \
 +      .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
 +      .id             = 0,                                                \
 +      .event_str      = _str,                                             \
 +};
 +
  #define PMU_FORMAT_ATTR(_name, _format)                                       \
  static ssize_t                                                                \
  _name##_show(struct device *dev,                                      \
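
The event_mapped/event_unmapped callbacks added to struct pmu above are optional and run in the mapping task's context; on x86 they are what allows the PMU to restrict RDPMC to tasks that currently have a perf event mmap-ed. A hypothetical PMU could hook them up along these lines (illustrative skeleton only; demo_pmu, active_mmaps and the enable/disable steps are assumptions, not the actual x86 implementation):

    #include <linux/kernel.h>
    #include <linux/atomic.h>
    #include <linux/perf_event.h>

    /* Hypothetical PMU wrapper that tracks how many of its events are
     * currently mmap-ed by user space. */
    struct demo_pmu {
            struct pmu      pmu;
            atomic_t        active_mmaps;
    };

    static void demo_event_mapped(struct perf_event *event)
    {
            struct demo_pmu *d = container_of(event->pmu, struct demo_pmu, pmu);

            /* First mapping: this is where user-space counter access
             * (e.g. RDPMC on x86) would be switched on. */
            if (atomic_inc_return(&d->active_mmaps) == 1)
                    pr_debug("demo_pmu: enabling user-space counter access\n");
    }

    static void demo_event_unmapped(struct perf_event *event)
    {
            struct demo_pmu *d = container_of(event->pmu, struct demo_pmu, pmu);

            /* Last mapping gone: revoke user-space access again. */
            if (atomic_dec_and_test(&d->active_mmaps))
                    pr_debug("demo_pmu: disabling user-space counter access\n");
    }

The callbacks are wired up simply by filling in the corresponding struct pmu fields before perf_pmu_register(); both remain NULL for PMUs that do not care about mappings.
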
diff --combined kernel/events/core.c
index 8812d8e35f5b03b13e148ff67ae33453cad306c1,13209a90b751d11ba9e7d05334997916f7e5c609..f04daabfd1cffb78856e03b634b9d7c914faf4d1
@@@ -4101,7 -4101,8 +4101,8 @@@ unlock
        rcu_read_unlock();
  }
  
- void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+ void __weak arch_perf_update_userpage(
+       struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
  {
  }
  
@@@ -4151,7 -4152,7 +4152,7 @@@ void perf_event_update_userpage(struct 
        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);
  
-       arch_perf_update_userpage(userpg, now);
+       arch_perf_update_userpage(event, userpg, now);
  
        barrier();
        ++userpg->lock;
@@@ -4293,6 -4294,9 +4294,9 @@@ static void perf_mmap_open(struct vm_ar
  
        atomic_inc(&event->mmap_count);
        atomic_inc(&event->rb->mmap_count);
+       if (event->pmu->event_mapped)
+               event->pmu->event_mapped(event);
  }
  
  /*
@@@ -4312,6 -4316,9 +4316,9 @@@ static void perf_mmap_close(struct vm_a
        int mmap_locked = rb->mmap_locked;
        unsigned long size = perf_data_size(rb);
  
+       if (event->pmu->event_unmapped)
+               event->pmu->event_unmapped(event);
        atomic_dec(&rb->mmap_count);
  
        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@@ -4513,6 -4520,9 +4520,9 @@@ unlock
        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &perf_mmap_vmops;
  
+       if (event->pmu->event_mapped)
+               event->pmu->event_mapped(event);
        return ret;
  }
  
@@@ -8508,18 -8518,6 +8518,18 @@@ void __init perf_event_init(void
                     != 1024);
  }
  
 +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 +                            char *page)
 +{
 +      struct perf_pmu_events_attr *pmu_attr =
 +              container_of(attr, struct perf_pmu_events_attr, attr);
 +
 +      if (pmu_attr->event_str)
 +              return sprintf(page, "%s\n", pmu_attr->event_str);
 +
 +      return 0;
 +}
 +
  static int __init perf_event_sysfs_init(void)
  {
        struct pmu *pmu;
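
To see why the mapped/unmapped notifications matter, here is roughly the user-space side of the new default policy: a task gains RDPMC access by mmap-ing one of its own perf events, and reads the counter by combining the kernel-maintained offset from the mmap-ed page with an RDPMC of the hardware counter. A minimal x86-64 sketch (error handling omitted; the seqlock-style loop follows the self-monitoring pattern documented in include/uapi/linux/perf_event.h):

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static inline uint64_t rdpmc(uint32_t counter)
    {
            uint32_t lo, hi;

            asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
            return ((uint64_t)hi << 32) | lo;
    }

    /* Read a self-monitored counter through the mmap-ed user page; the
     * mmap itself is what (under the new default) grants this task RDPMC. */
    static uint64_t read_counter(volatile struct perf_event_mmap_page *pc)
    {
            uint64_t count;
            uint32_t seq, idx;

            do {
                    seq = pc->lock;
                    asm volatile("" ::: "memory");  /* barrier */
                    idx = pc->index;
                    count = pc->offset;
                    if (pc->cap_user_rdpmc && idx)
                            count += rdpmc(idx - 1);
                    asm volatile("" ::: "memory");  /* barrier */
            } while (pc->lock != seq);

            return count;
    }

    int main(void)
    {
            struct perf_event_attr attr;
            struct perf_event_mmap_page *pc;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_INSTRUCTIONS;
            attr.exclude_kernel = 1;

            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);

            /* ... run the code being measured ... */

            return read_counter(pc) != 0;
    }
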