Git Repo - linux.git/commitdiff
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <[email protected]>
Wed, 18 Jul 2018 18:08:44 +0000 (11:08 -0700)
committerLinus Torvalds <[email protected]>
Wed, 18 Jul 2018 18:08:44 +0000 (11:08 -0700)
Pull kvm fixes from Paolo Bonzini:
 "Miscellaneous bugfixes, plus a small patchlet related to Spectre v2"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  kvmclock: fix TSC calibration for nested guests
  KVM: VMX: Mark VMXArea with revision_id of physical CPU even when eVMCS enabled
  KVM: irqfd: fix race between EPOLLHUP and irq_bypass_register_consumer
  KVM/Eventfd: Avoid crash when assign and deassign specific eventfd in parallel.
  x86/kvmclock: set pvti_cpu0_va after enabling kvmclock
  x86/kvm/Kconfig: Ensure CRYPTO_DEV_CCP_DD state at minimum matches KVM_AMD
  kvm: nVMX: Restore exit qual for VM-entry failure due to MSR loading
  x86/kvm/vmx: don't read current->thread.{fs,gs}base of legacy tasks
  KVM: VMX: support MSR_IA32_ARCH_CAPABILITIES as a feature MSR

arch/x86/kernel/kvmclock.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
virt/kvm/eventfd.c

index bf8d1eb7fca3d97976b7747f49a5e5d77d18edde,4c53d12ca933824126563f8babf31fd35cfe705b..3b8e7c13c614a41fcf4533bd840630c4e3912d8f
@@@ -53,7 -53,7 +53,7 @@@ static struct pvclock_wall_clock *wall_
   * have elapsed since the hypervisor wrote the data. So we try to account for
   * that with system time
   */
 -static void kvm_get_wallclock(struct timespec *now)
 +static void kvm_get_wallclock(struct timespec64 *now)
  {
        struct pvclock_vcpu_time_info *vcpu_time;
        int low, high;
@@@ -72,7 -72,7 +72,7 @@@
        put_cpu();
  }
  
 -static int kvm_set_wallclock(const struct timespec *now)
 +static int kvm_set_wallclock(const struct timespec64 *now)
  {
        return -ENODEV;
  }
@@@ -138,6 -138,7 +138,7 @@@ static unsigned long kvm_get_tsc_khz(vo
        src = &hv_clock[cpu].pvti;
        tsc_khz = pvclock_tsc_khz(src);
        put_cpu();
+       setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
        return tsc_khz;
  }
  
@@@ -319,6 -320,8 +320,8 @@@ void __init kvmclock_init(void
        printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
                msr_kvm_system_time, msr_kvm_wall_clock);
  
+       pvclock_set_pvti_cpu0_va(hv_clock);
        if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
                pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
  
@@@ -366,14 -369,11 +369,11 @@@ int __init kvm_setup_vsyscall_timeinfo(
        vcpu_time = &hv_clock[cpu].pvti;
        flags = pvclock_read_flags(vcpu_time);
  
-       if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
-               put_cpu();
-               return 1;
-       }
-       pvclock_set_pvti_cpu0_va(hv_clock);
        put_cpu();
  
+       if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+               return 1;
        kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
  #endif
        return 0;
diff --combined arch/x86/kvm/vmx.c
index 1689f433f3a081382795ae16120710329b73a6ad,c3c85908b8de0bd62122d5076ad87a4bfd1e3fe0..e30da9a2430cad425c56decdb5dd284c381fd9bc
@@@ -242,11 -242,7 +242,11 @@@ struct shared_msr_entry 
   * underlying hardware which will be used to run L2.
   * This structure is packed to ensure that its layout is identical across
   * machines (necessary for live migration).
 - * If there are changes in this struct, VMCS12_REVISION must be changed.
 + *
 + * IMPORTANT: Changing the layout of existing fields in this structure
 + * will break save/restore compatibility with older kvm releases. When
 + * adding new fields, either use space in the reserved padding* arrays
 + * or add the new fields to the end of the structure.
   */
  typedef u64 natural_width;
  struct __packed vmcs12 {
        u64 virtual_apic_page_addr;
        u64 apic_access_addr;
        u64 posted_intr_desc_addr;
 -      u64 vm_function_control;
        u64 ept_pointer;
        u64 eoi_exit_bitmap0;
        u64 eoi_exit_bitmap1;
        u64 eoi_exit_bitmap2;
        u64 eoi_exit_bitmap3;
 -      u64 eptp_list_address;
        u64 xss_exit_bitmap;
        u64 guest_physical_address;
        u64 vmcs_link_pointer;
 -      u64 pml_address;
        u64 guest_ia32_debugctl;
        u64 guest_ia32_pat;
        u64 guest_ia32_efer;
        u64 host_ia32_pat;
        u64 host_ia32_efer;
        u64 host_ia32_perf_global_ctrl;
 -      u64 padding64[8]; /* room for future expansion */
 +      u64 vmread_bitmap;
 +      u64 vmwrite_bitmap;
 +      u64 vm_function_control;
 +      u64 eptp_list_address;
 +      u64 pml_address;
 +      u64 padding64[3]; /* room for future expansion */
        /*
         * To allow migration of L1 (complete with its L2 guests) between
         * machines of different natural widths (32 or 64 bit), we cannot have
        u16 guest_ldtr_selector;
        u16 guest_tr_selector;
        u16 guest_intr_status;
 -      u16 guest_pml_index;
        u16 host_es_selector;
        u16 host_cs_selector;
        u16 host_ss_selector;
        u16 host_fs_selector;
        u16 host_gs_selector;
        u16 host_tr_selector;
 +      u16 guest_pml_index;
  };
  
 +/*
 + * For save/restore compatibility, the vmcs12 field offsets must not change.
 + */
 +#define CHECK_OFFSET(field, loc)                              \
 +      BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc),       \
 +              "Offset of " #field " in struct vmcs12 has changed.")
 +
 +static inline void vmx_check_vmcs12_offsets(void) {
 +      CHECK_OFFSET(revision_id, 0);
 +      CHECK_OFFSET(abort, 4);
 +      CHECK_OFFSET(launch_state, 8);
 +      CHECK_OFFSET(io_bitmap_a, 40);
 +      CHECK_OFFSET(io_bitmap_b, 48);
 +      CHECK_OFFSET(msr_bitmap, 56);
 +      CHECK_OFFSET(vm_exit_msr_store_addr, 64);
 +      CHECK_OFFSET(vm_exit_msr_load_addr, 72);
 +      CHECK_OFFSET(vm_entry_msr_load_addr, 80);
 +      CHECK_OFFSET(tsc_offset, 88);
 +      CHECK_OFFSET(virtual_apic_page_addr, 96);
 +      CHECK_OFFSET(apic_access_addr, 104);
 +      CHECK_OFFSET(posted_intr_desc_addr, 112);
 +      CHECK_OFFSET(ept_pointer, 120);
 +      CHECK_OFFSET(eoi_exit_bitmap0, 128);
 +      CHECK_OFFSET(eoi_exit_bitmap1, 136);
 +      CHECK_OFFSET(eoi_exit_bitmap2, 144);
 +      CHECK_OFFSET(eoi_exit_bitmap3, 152);
 +      CHECK_OFFSET(xss_exit_bitmap, 160);
 +      CHECK_OFFSET(guest_physical_address, 168);
 +      CHECK_OFFSET(vmcs_link_pointer, 176);
 +      CHECK_OFFSET(guest_ia32_debugctl, 184);
 +      CHECK_OFFSET(guest_ia32_pat, 192);
 +      CHECK_OFFSET(guest_ia32_efer, 200);
 +      CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
 +      CHECK_OFFSET(guest_pdptr0, 216);
 +      CHECK_OFFSET(guest_pdptr1, 224);
 +      CHECK_OFFSET(guest_pdptr2, 232);
 +      CHECK_OFFSET(guest_pdptr3, 240);
 +      CHECK_OFFSET(guest_bndcfgs, 248);
 +      CHECK_OFFSET(host_ia32_pat, 256);
 +      CHECK_OFFSET(host_ia32_efer, 264);
 +      CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
 +      CHECK_OFFSET(vmread_bitmap, 280);
 +      CHECK_OFFSET(vmwrite_bitmap, 288);
 +      CHECK_OFFSET(vm_function_control, 296);
 +      CHECK_OFFSET(eptp_list_address, 304);
 +      CHECK_OFFSET(pml_address, 312);
 +      CHECK_OFFSET(cr0_guest_host_mask, 344);
 +      CHECK_OFFSET(cr4_guest_host_mask, 352);
 +      CHECK_OFFSET(cr0_read_shadow, 360);
 +      CHECK_OFFSET(cr4_read_shadow, 368);
 +      CHECK_OFFSET(cr3_target_value0, 376);
 +      CHECK_OFFSET(cr3_target_value1, 384);
 +      CHECK_OFFSET(cr3_target_value2, 392);
 +      CHECK_OFFSET(cr3_target_value3, 400);
 +      CHECK_OFFSET(exit_qualification, 408);
 +      CHECK_OFFSET(guest_linear_address, 416);
 +      CHECK_OFFSET(guest_cr0, 424);
 +      CHECK_OFFSET(guest_cr3, 432);
 +      CHECK_OFFSET(guest_cr4, 440);
 +      CHECK_OFFSET(guest_es_base, 448);
 +      CHECK_OFFSET(guest_cs_base, 456);
 +      CHECK_OFFSET(guest_ss_base, 464);
 +      CHECK_OFFSET(guest_ds_base, 472);
 +      CHECK_OFFSET(guest_fs_base, 480);
 +      CHECK_OFFSET(guest_gs_base, 488);
 +      CHECK_OFFSET(guest_ldtr_base, 496);
 +      CHECK_OFFSET(guest_tr_base, 504);
 +      CHECK_OFFSET(guest_gdtr_base, 512);
 +      CHECK_OFFSET(guest_idtr_base, 520);
 +      CHECK_OFFSET(guest_dr7, 528);
 +      CHECK_OFFSET(guest_rsp, 536);
 +      CHECK_OFFSET(guest_rip, 544);
 +      CHECK_OFFSET(guest_rflags, 552);
 +      CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
 +      CHECK_OFFSET(guest_sysenter_esp, 568);
 +      CHECK_OFFSET(guest_sysenter_eip, 576);
 +      CHECK_OFFSET(host_cr0, 584);
 +      CHECK_OFFSET(host_cr3, 592);
 +      CHECK_OFFSET(host_cr4, 600);
 +      CHECK_OFFSET(host_fs_base, 608);
 +      CHECK_OFFSET(host_gs_base, 616);
 +      CHECK_OFFSET(host_tr_base, 624);
 +      CHECK_OFFSET(host_gdtr_base, 632);
 +      CHECK_OFFSET(host_idtr_base, 640);
 +      CHECK_OFFSET(host_ia32_sysenter_esp, 648);
 +      CHECK_OFFSET(host_ia32_sysenter_eip, 656);
 +      CHECK_OFFSET(host_rsp, 664);
 +      CHECK_OFFSET(host_rip, 672);
 +      CHECK_OFFSET(pin_based_vm_exec_control, 744);
 +      CHECK_OFFSET(cpu_based_vm_exec_control, 748);
 +      CHECK_OFFSET(exception_bitmap, 752);
 +      CHECK_OFFSET(page_fault_error_code_mask, 756);
 +      CHECK_OFFSET(page_fault_error_code_match, 760);
 +      CHECK_OFFSET(cr3_target_count, 764);
 +      CHECK_OFFSET(vm_exit_controls, 768);
 +      CHECK_OFFSET(vm_exit_msr_store_count, 772);
 +      CHECK_OFFSET(vm_exit_msr_load_count, 776);
 +      CHECK_OFFSET(vm_entry_controls, 780);
 +      CHECK_OFFSET(vm_entry_msr_load_count, 784);
 +      CHECK_OFFSET(vm_entry_intr_info_field, 788);
 +      CHECK_OFFSET(vm_entry_exception_error_code, 792);
 +      CHECK_OFFSET(vm_entry_instruction_len, 796);
 +      CHECK_OFFSET(tpr_threshold, 800);
 +      CHECK_OFFSET(secondary_vm_exec_control, 804);
 +      CHECK_OFFSET(vm_instruction_error, 808);
 +      CHECK_OFFSET(vm_exit_reason, 812);
 +      CHECK_OFFSET(vm_exit_intr_info, 816);
 +      CHECK_OFFSET(vm_exit_intr_error_code, 820);
 +      CHECK_OFFSET(idt_vectoring_info_field, 824);
 +      CHECK_OFFSET(idt_vectoring_error_code, 828);
 +      CHECK_OFFSET(vm_exit_instruction_len, 832);
 +      CHECK_OFFSET(vmx_instruction_info, 836);
 +      CHECK_OFFSET(guest_es_limit, 840);
 +      CHECK_OFFSET(guest_cs_limit, 844);
 +      CHECK_OFFSET(guest_ss_limit, 848);
 +      CHECK_OFFSET(guest_ds_limit, 852);
 +      CHECK_OFFSET(guest_fs_limit, 856);
 +      CHECK_OFFSET(guest_gs_limit, 860);
 +      CHECK_OFFSET(guest_ldtr_limit, 864);
 +      CHECK_OFFSET(guest_tr_limit, 868);
 +      CHECK_OFFSET(guest_gdtr_limit, 872);
 +      CHECK_OFFSET(guest_idtr_limit, 876);
 +      CHECK_OFFSET(guest_es_ar_bytes, 880);
 +      CHECK_OFFSET(guest_cs_ar_bytes, 884);
 +      CHECK_OFFSET(guest_ss_ar_bytes, 888);
 +      CHECK_OFFSET(guest_ds_ar_bytes, 892);
 +      CHECK_OFFSET(guest_fs_ar_bytes, 896);
 +      CHECK_OFFSET(guest_gs_ar_bytes, 900);
 +      CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
 +      CHECK_OFFSET(guest_tr_ar_bytes, 908);
 +      CHECK_OFFSET(guest_interruptibility_info, 912);
 +      CHECK_OFFSET(guest_activity_state, 916);
 +      CHECK_OFFSET(guest_sysenter_cs, 920);
 +      CHECK_OFFSET(host_ia32_sysenter_cs, 924);
 +      CHECK_OFFSET(vmx_preemption_timer_value, 928);
 +      CHECK_OFFSET(virtual_processor_id, 960);
 +      CHECK_OFFSET(posted_intr_nv, 962);
 +      CHECK_OFFSET(guest_es_selector, 964);
 +      CHECK_OFFSET(guest_cs_selector, 966);
 +      CHECK_OFFSET(guest_ss_selector, 968);
 +      CHECK_OFFSET(guest_ds_selector, 970);
 +      CHECK_OFFSET(guest_fs_selector, 972);
 +      CHECK_OFFSET(guest_gs_selector, 974);
 +      CHECK_OFFSET(guest_ldtr_selector, 976);
 +      CHECK_OFFSET(guest_tr_selector, 978);
 +      CHECK_OFFSET(guest_intr_status, 980);
 +      CHECK_OFFSET(host_es_selector, 982);
 +      CHECK_OFFSET(host_cs_selector, 984);
 +      CHECK_OFFSET(host_ss_selector, 986);
 +      CHECK_OFFSET(host_ds_selector, 988);
 +      CHECK_OFFSET(host_fs_selector, 990);
 +      CHECK_OFFSET(host_gs_selector, 992);
 +      CHECK_OFFSET(host_tr_selector, 994);
 +      CHECK_OFFSET(guest_pml_index, 996);
 +}
 +
  /*
   * VMCS12_REVISION is an arbitrary id that should be changed if the content or
   * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
   * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 + *
 + * IMPORTANT: Changing this value will break save/restore compatibility with
 + * older kvm releases.
   */
  #define VMCS12_REVISION 0x11e57ed0
  
@@@ -646,8 -481,7 +646,8 @@@ struct nested_vmx 
        bool sync_shadow_vmcs;
        bool dirty_vmcs12;
  
 -      bool change_vmcs01_virtual_x2apic_mode;
 +      bool change_vmcs01_virtual_apic_mode;
 +
        /* L2 must run next, and mustn't decide to exit to L1. */
        bool nested_run_pending;
  
@@@ -927,7 -761,6 +927,7 @@@ static const unsigned short vmcs_field_
        FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
        FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
        FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
 +      FIELD64(PML_ADDRESS, pml_address),
        FIELD64(TSC_OFFSET, tsc_offset),
        FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
        FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
        FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
        FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
        FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
 +      FIELD64(VMREAD_BITMAP, vmread_bitmap),
 +      FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
        FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
 -      FIELD64(PML_ADDRESS, pml_address),
        FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
        FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
        FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
@@@ -1257,16 -1089,6 +1257,16 @@@ static inline u16 evmcs_read16(unsigne
        return *(u16 *)((char *)current_evmcs + offset);
  }
  
 +static inline void evmcs_touch_msr_bitmap(void)
 +{
 +      if (unlikely(!current_evmcs))
 +              return;
 +
 +      if (current_evmcs->hv_enlightenments_control.msr_bitmap)
 +              current_evmcs->hv_clean_fields &=
 +                      ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
 +}
 +
  static void evmcs_load(u64 phys_addr)
  {
        struct hv_vp_assist_page *vp_ap =
@@@ -1351,7 -1173,6 +1351,7 @@@ static inline u32 evmcs_read32(unsigne
  static inline u16 evmcs_read16(unsigned long field) { return 0; }
  static inline void evmcs_load(u64 phys_addr) {}
  static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
 +static inline void evmcs_touch_msr_bitmap(void) {}
  #endif /* IS_ENABLED(CONFIG_HYPERV) */
  
  static inline bool is_exception_n(u32 intr_info, u8 vector)
@@@ -1572,11 -1393,6 +1572,11 @@@ static inline bool cpu_has_vmx_invept_g
        return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
  }
  
 +static inline bool cpu_has_vmx_invvpid_individual_addr(void)
 +{
 +      return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
 +}
 +
  static inline bool cpu_has_vmx_invvpid_single(void)
  {
        return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
@@@ -1694,28 -1510,6 +1694,28 @@@ static inline unsigned nested_cpu_vmx_m
        return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
  }
  
 +/*
 + * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
 + * to modify any valid field of the VMCS, or are the VM-exit
 + * information fields read-only?
 + */
 +static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
 +{
 +      return to_vmx(vcpu)->nested.msrs.misc_low &
 +              MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
 +}
 +
 +static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
 +{
 +      return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
 +}
 +
 +static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
 +{
 +      return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
 +                      CPU_BASED_MONITOR_TRAP_FLAG;
 +}
 +
  static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
  {
        return vmcs12->cpu_based_vm_exec_control & bit;
@@@ -2571,6 -2365,7 +2571,7 @@@ static void vmx_save_host_state(struct 
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  #ifdef CONFIG_X86_64
        int cpu = raw_smp_processor_id();
+       unsigned long fs_base, kernel_gs_base;
  #endif
        int i;
  
        vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
  
  #ifdef CONFIG_X86_64
-       save_fsgs_for_kvm();
-       vmx->host_state.fs_sel = current->thread.fsindex;
-       vmx->host_state.gs_sel = current->thread.gsindex;
- #else
-       savesegment(fs, vmx->host_state.fs_sel);
-       savesegment(gs, vmx->host_state.gs_sel);
+       if (likely(is_64bit_mm(current->mm))) {
+               save_fsgs_for_kvm();
+               vmx->host_state.fs_sel = current->thread.fsindex;
+               vmx->host_state.gs_sel = current->thread.gsindex;
+               fs_base = current->thread.fsbase;
+               kernel_gs_base = current->thread.gsbase;
+       } else {
+ #endif
+               savesegment(fs, vmx->host_state.fs_sel);
+               savesegment(gs, vmx->host_state.gs_sel);
+ #ifdef CONFIG_X86_64
+               fs_base = read_msr(MSR_FS_BASE);
+               kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+       }
  #endif
        if (!(vmx->host_state.fs_sel & 7)) {
                vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
        savesegment(ds, vmx->host_state.ds_sel);
        savesegment(es, vmx->host_state.es_sel);
  
-       vmcs_writel(HOST_FS_BASE, current->thread.fsbase);
+       vmcs_writel(HOST_FS_BASE, fs_base);
        vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
  
-       vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+       vmx->msr_host_kernel_gs_base = kernel_gs_base;
        if (is_long_mode(&vmx->vcpu))
                wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
  #else
@@@ -3333,7 -3136,6 +3342,7 @@@ static void nested_vmx_setup_ctls_msrs(
                msrs->misc_high);
        msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
        msrs->misc_low |=
 +              MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
                VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
                VMX_MISC_ACTIVITY_HLT;
        msrs->misc_high = 0;
@@@ -3507,15 -3309,6 +3516,15 @@@ static int vmx_restore_vmx_misc(struct 
  
        vmx->nested.msrs.misc_low = data;
        vmx->nested.msrs.misc_high = data >> 32;
 +
 +      /*
 +       * If L1 has read-only VM-exit information fields, use the
 +       * less permissive vmx_vmwrite_bitmap to specify write
 +       * permissions for the shadow VMCS.
 +       */
 +      if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
 +              vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
 +
        return 0;
  }
  
@@@ -3570,13 -3363,6 +3579,13 @@@ static int vmx_set_vmx_msr(struct kvm_v
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
 +      /*
 +       * Don't allow changes to the VMX capability MSRs while the vCPU
 +       * is in VMX operation.
 +       */
 +      if (vmx->nested.vmxon)
 +              return -EBUSY;
 +
        switch (msr_index) {
        case MSR_IA32_VMX_BASIC:
                return vmx_restore_vmx_basic(vmx, data);
@@@ -4322,11 -4108,7 +4331,7 @@@ static __init int setup_vmcs_config(str
        vmcs_conf->order = get_order(vmcs_conf->size);
        vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
  
-       /* KVM supports Enlightened VMCS v1 only */
-       if (static_branch_unlikely(&enable_evmcs))
-               vmcs_conf->revision_id = KVM_EVMCS_VERSION;
-       else
-               vmcs_conf->revision_id = vmx_msr_low;
+       vmcs_conf->revision_id = vmx_msr_low;
  
        vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
        vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
@@@ -4396,7 -4178,13 +4401,13 @@@ static struct vmcs *alloc_vmcs_cpu(int 
                return NULL;
        vmcs = page_address(pages);
        memset(vmcs, 0, vmcs_config.size);
-       vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
+       /* KVM supports Enlightened VMCS v1 only */
+       if (static_branch_unlikely(&enable_evmcs))
+               vmcs->revision_id = KVM_EVMCS_VERSION;
+       else
+               vmcs->revision_id = vmcs_config.revision_id;
        return vmcs;
  }
  
@@@ -4439,15 -4227,6 +4450,15 @@@ static int alloc_loaded_vmcs(struct loa
                if (!loaded_vmcs->msr_bitmap)
                        goto out_vmcs;
                memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
 +
 +              if (IS_ENABLED(CONFIG_HYPERV) &&
 +                  static_branch_unlikely(&enable_evmcs) &&
 +                  (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
 +                      struct hv_enlightened_vmcs *evmcs =
 +                              (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
 +
 +                      evmcs->hv_enlightenments_control.msr_bitmap = 1;
 +              }
        }
        return 0;
  
@@@ -4564,6 -4343,19 +4575,19 @@@ static __init int alloc_kvm_area(void
                        return -ENOMEM;
                }
  
+               /*
+                * When eVMCS is enabled, alloc_vmcs_cpu() sets
+                * vmcs->revision_id to KVM_EVMCS_VERSION instead of
+                * revision_id reported by MSR_IA32_VMX_BASIC.
+                *
+                * However, even though not explicitly documented by
+                * the TLFS, the VMXArea passed as the VMXON argument
+                * should still be marked with the revision_id reported
+                * by the physical CPU.
+                */
+               if (static_branch_unlikely(&enable_evmcs))
+                       vmcs->revision_id = vmcs_config.revision_id;
                per_cpu(vmxarea, cpu) = vmcs;
        }
        return 0;
@@@ -5561,9 -5353,6 +5585,9 @@@ static void __always_inline vmx_disable
        if (!cpu_has_vmx_msr_bitmap())
                return;
  
 +      if (static_branch_unlikely(&enable_evmcs))
 +              evmcs_touch_msr_bitmap();
 +
        /*
         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
         * have the write-low and read-high bitmap offsets the wrong way round.
@@@ -5599,9 -5388,6 +5623,9 @@@ static void __always_inline vmx_enable_
        if (!cpu_has_vmx_msr_bitmap())
                return;
  
 +      if (static_branch_unlikely(&enable_evmcs))
 +              evmcs_touch_msr_bitmap();
 +
        /*
         * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
         * have the write-low and read-high bitmap offsets the wrong way round.
@@@ -6184,14 -5970,8 +6208,14 @@@ static void vmx_vcpu_setup(struct vcpu_
        int i;
  
        if (enable_shadow_vmcs) {
 +              /*
 +               * At vCPU creation, "VMWRITE to any supported field
 +               * in the VMCS" is supported, so use the more
 +               * permissive vmx_vmread_bitmap to specify both read
 +               * and write permissions for the shadow VMCS.
 +               */
                vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
 -              vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
 +              vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
        }
        if (cpu_has_vmx_msr_bitmap())
                vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
@@@ -7832,7 -7612,8 +7856,7 @@@ static int nested_vmx_get_vmptr(struct 
                        vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
                return 1;
  
 -      if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
 -                              sizeof(*vmpointer), &e)) {
 +      if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@@ -7913,12 -7694,6 +7937,12 @@@ static int handle_vmon(struct kvm_vcpu 
                return 1;
        }
  
 +      /* CPL=0 must be checked manually. */
 +      if (vmx_get_cpl(vcpu)) {
 +              kvm_queue_exception(vcpu, UD_VECTOR);
 +              return 1;
 +      }
 +
        if (vmx->nested.vmxon) {
                nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
                return kvm_skip_emulated_instruction(vcpu);
   */
  static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
  {
 +      if (vmx_get_cpl(vcpu)) {
 +              kvm_queue_exception(vcpu, UD_VECTOR);
 +              return 0;
 +      }
 +
        if (!to_vmx(vcpu)->nested.vmxon) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 0;
@@@ -8182,42 -7952,23 +8206,42 @@@ static inline int vmcs12_write_any(stru
  
  }
  
 +/*
 + * Copy the writable VMCS shadow fields back to the VMCS12, in case
 + * they have been modified by the L1 guest. Note that the "read-only"
 + * VM-exit information fields are actually writable if the vCPU is
 + * configured to support "VMWRITE to any supported field in the VMCS."
 + */
  static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
  {
 -      int i;
 +      const u16 *fields[] = {
 +              shadow_read_write_fields,
 +              shadow_read_only_fields
 +      };
 +      const int max_fields[] = {
 +              max_shadow_read_write_fields,
 +              max_shadow_read_only_fields
 +      };
 +      int i, q;
        unsigned long field;
        u64 field_value;
        struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
 -      const u16 *fields = shadow_read_write_fields;
 -      const int num_fields = max_shadow_read_write_fields;
  
        preempt_disable();
  
        vmcs_load(shadow_vmcs);
  
 -      for (i = 0; i < num_fields; i++) {
 -              field = fields[i];
 -              field_value = __vmcs_readl(field);
 -              vmcs12_write_any(&vmx->vcpu, field, field_value);
 +      for (q = 0; q < ARRAY_SIZE(fields); q++) {
 +              for (i = 0; i < max_fields[q]; i++) {
 +                      field = fields[q][i];
 +                      field_value = __vmcs_readl(field);
 +                      vmcs12_write_any(&vmx->vcpu, field, field_value);
 +              }
 +              /*
 +               * Skip the VM-exit information fields if they are read-only.
 +               */
 +              if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
 +                      break;
        }
  
        vmcs_clear(shadow_vmcs);
@@@ -8302,9 -8053,9 +8326,9 @@@ static int handle_vmread(struct kvm_vcp
                if (get_vmx_mem_address(vcpu, exit_qualification,
                                vmx_instruction_info, true, &gva))
                        return 1;
 -              /* _system ok, as hardware has verified cpl=0 */
 -              kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
 -                           &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
 +              /* _system ok, nested_vmx_check_permission has verified cpl=0 */
 +              kvm_write_guest_virt_system(vcpu, gva, &field_value,
 +                                          (is_long_mode(vcpu) ? 8 : 4), NULL);
        }
  
        nested_vmx_succeed(vcpu);
@@@ -8342,8 -8093,8 +8366,8 @@@ static int handle_vmwrite(struct kvm_vc
                if (get_vmx_mem_address(vcpu, exit_qualification,
                                vmx_instruction_info, false, &gva))
                        return 1;
 -              if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
 -                         &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
 +              if (kvm_read_guest_virt(vcpu, gva, &field_value,
 +                                      (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
                        kvm_inject_page_fault(vcpu, &e);
                        return 1;
                }
  
  
        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 -      if (vmcs_field_readonly(field)) {
 +      /*
 +       * If the vCPU supports "VMWRITE to any supported field in the
 +       * VMCS," then the "read-only" fields are actually read/write.
 +       */
 +      if (vmcs_field_readonly(field) &&
 +          !nested_cpu_has_vmwrite_any_field(vcpu)) {
                nested_vmx_failValid(vcpu,
                        VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
                return kvm_skip_emulated_instruction(vcpu);
@@@ -8467,10 -8213,10 +8491,10 @@@ static int handle_vmptrst(struct kvm_vc
        if (get_vmx_mem_address(vcpu, exit_qualification,
                        vmx_instruction_info, true, &vmcs_gva))
                return 1;
 -      /* ok to use *_system, as hardware has verified cpl=0 */
 -      if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
 -                               (void *)&to_vmx(vcpu)->nested.current_vmptr,
 -                               sizeof(u64), &e)) {
 +      /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
 +      if (kvm_write_guest_virt_system(vcpu, vmcs_gva,
 +                                      (void *)&to_vmx(vcpu)->nested.current_vmptr,
 +                                      sizeof(u64), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@@ -8517,7 -8263,8 +8541,7 @@@ static int handle_invept(struct kvm_vcp
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
                        vmx_instruction_info, false, &gva))
                return 1;
 -      if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
 -                              sizeof(operand), &e)) {
 +      if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@@ -8581,7 -8328,8 +8605,7 @@@ static int handle_invvpid(struct kvm_vc
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
                        vmx_instruction_info, false, &gva))
                return 1;
 -      if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
 -                              sizeof(operand), &e)) {
 +      if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
  
        switch (type) {
        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
 -              if (is_noncanonical_address(operand.gla, vcpu)) {
 +              if (!operand.vpid ||
 +                  is_noncanonical_address(operand.gla, vcpu)) {
                        nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
                        return kvm_skip_emulated_instruction(vcpu);
                }
 -              /* fall through */
 +              if (cpu_has_vmx_invvpid_individual_addr() &&
 +                  vmx->nested.vpid02) {
 +                      __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
 +                              vmx->nested.vpid02, operand.gla);
 +              } else
 +                      __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
 +              break;
        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
                if (!operand.vpid) {
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
                        return kvm_skip_emulated_instruction(vcpu);
                }
 +              __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
                break;
        case VMX_VPID_EXTENT_ALL_CONTEXT:
 +              __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
                break;
        default:
                WARN_ON_ONCE(1);
                return kvm_skip_emulated_instruction(vcpu);
        }
  
 -      __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
        nested_vmx_succeed(vcpu);
  
        return kvm_skip_emulated_instruction(vcpu);
@@@ -9126,13 -8866,11 +9150,13 @@@ static bool nested_vmx_exit_reflected(s
        case EXIT_REASON_TPR_BELOW_THRESHOLD:
                return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
        case EXIT_REASON_APIC_ACCESS:
 -              return nested_cpu_has2(vmcs12,
 -                      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
        case EXIT_REASON_APIC_WRITE:
        case EXIT_REASON_EOI_INDUCED:
 -              /* apic_write and eoi_induced should exit unconditionally. */
 +              /*
 +               * The controls for "virtualize APIC accesses," "APIC-
 +               * register virtualization," and "virtual-interrupt
 +               * delivery" only come from vmcs12.
 +               */
                return true;
        case EXIT_REASON_EPT_VIOLATION:
                /*
@@@ -9539,43 -9277,31 +9563,43 @@@ static void update_cr8_intercept(struc
        vmcs_write32(TPR_THRESHOLD, irr);
  }
  
 -static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 +static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
  {
        u32 sec_exec_control;
  
 +      if (!lapic_in_kernel(vcpu))
 +              return;
 +
        /* Postpone execution until vmcs01 is the current VMCS. */
        if (is_guest_mode(vcpu)) {
 -              to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
 +              to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
                return;
        }
  
 -      if (!cpu_has_vmx_virtualize_x2apic_mode())
 -              return;
 -
        if (!cpu_need_tpr_shadow(vcpu))
                return;
  
        sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 +      sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 +                            SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
  
 -      if (set) {
 -              sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 -              sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 -      } else {
 -              sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 -              sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 -              vmx_flush_tlb(vcpu, true);
 +      switch (kvm_get_apic_mode(vcpu)) {
 +      case LAPIC_MODE_INVALID:
 +              WARN_ONCE(true, "Invalid local APIC state");
 +      case LAPIC_MODE_DISABLED:
 +              break;
 +      case LAPIC_MODE_XAPIC:
 +              if (flexpriority_enabled) {
 +                      sec_exec_control |=
 +                              SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 +                      vmx_flush_tlb(vcpu, true);
 +              }
 +              break;
 +      case LAPIC_MODE_X2APIC:
 +              if (cpu_has_vmx_virtualize_x2apic_mode())
 +                      sec_exec_control |=
 +                              SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 +              break;
        }
        vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
  
  
  static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
  {
 -      struct vcpu_vmx *vmx = to_vmx(vcpu);
 -
 -      /*
 -       * Currently we do not handle the nested case where L2 has an
 -       * APIC access page of its own; that page is still pinned.
 -       * Hence, we skip the case where the VCPU is in guest mode _and_
 -       * L1 prepared an APIC access page for L2.
 -       *
 -       * For the case where L1 and L2 share the same APIC access page
 -       * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear
 -       * in the vmcs12), this function will only update either the vmcs01
 -       * or the vmcs02.  If the former, the vmcs02 will be updated by
 -       * prepare_vmcs02.  If the latter, the vmcs01 will be updated in
 -       * the next L2->L1 exit.
 -       */
 -      if (!is_guest_mode(vcpu) ||
 -          !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
 -                           SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
 +      if (!is_guest_mode(vcpu)) {
                vmcs_write64(APIC_ACCESS_ADDR, hpa);
                vmx_flush_tlb(vcpu, true);
        }
@@@ -10224,13 -9967,13 +10248,13 @@@ STACK_FRAME_NON_STANDARD(vmx_vcpu_run)
  
  static struct kvm *vmx_vm_alloc(void)
  {
 -      struct kvm_vmx *kvm_vmx = kzalloc(sizeof(struct kvm_vmx), GFP_KERNEL);
 +      struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
        return &kvm_vmx->kvm;
  }
  
  static void vmx_vm_free(struct kvm *kvm)
  {
 -      kfree(to_kvm_vmx(kvm));
 +      vfree(to_kvm_vmx(kvm));
  }
  
  static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
@@@ -10668,6 -10411,11 +10692,6 @@@ static void nested_get_vmcs12_pages(str
                        vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
                                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
                }
 -      } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
 -                 cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
 -              vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
 -                            SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
 -              kvm_vcpu_reload_apic_access_page(vcpu);
        }
  
        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
@@@ -11147,7 -10895,8 +11171,7 @@@ static int nested_vmx_load_cr3(struct k
        return 0;
  }
  
 -static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 -                             bool from_vmentry)
 +static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
   * is assigned to entry_failure_code on failure.
   */
  static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 -                        bool from_vmentry, u32 *entry_failure_code)
 +                        u32 *entry_failure_code)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control, vmcs12_exec_ctrl;
  
        if (vmx->nested.dirty_vmcs12) {
 -              prepare_vmcs02_full(vcpu, vmcs12, from_vmentry);
 +              prepare_vmcs02_full(vcpu, vmcs12);
                vmx->nested.dirty_vmcs12 = false;
        }
  
         * HOST_FS_BASE, HOST_GS_BASE.
         */
  
 -      if (from_vmentry &&
 +      if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
                vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
                kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
                vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
        }
 -      if (from_vmentry) {
 +      if (vmx->nested.nested_run_pending) {
                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                             vmcs12->vm_entry_intr_info_field);
                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
                        ~VM_ENTRY_IA32E_MODE) |
                (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
  
 -      if (from_vmentry &&
 +      if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
                vcpu->arch.pat = vmcs12->guest_ia32_pat;
                if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
                        if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
                                vmx->nested.last_vpid = vmcs12->virtual_processor_id;
 -                              __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true);
 +                              __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
                        }
                } else {
                        vmx_flush_tlb(vcpu, true);
        vmx_set_cr4(vcpu, vmcs12->guest_cr4);
        vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
  
 -      if (from_vmentry &&
 +      if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
                vcpu->arch.efer = vmcs12->guest_ia32_efer;
        else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@@ -11631,62 -11380,6 +11655,62 @@@ static int check_vmentry_prereqs(struc
            !nested_cr3_valid(vcpu, vmcs12->host_cr3))
                return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
  
 +      /*
 +       * From the Intel SDM, volume 3:
 +       * Fields relevant to VM-entry event injection must be set properly.
 +       * These fields are the VM-entry interruption-information field, the
 +       * VM-entry exception error code, and the VM-entry instruction length.
 +       */
 +      if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
 +              u32 intr_info = vmcs12->vm_entry_intr_info_field;
 +              u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
 +              u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
 +              bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
 +              bool should_have_error_code;
 +              bool urg = nested_cpu_has2(vmcs12,
 +                                         SECONDARY_EXEC_UNRESTRICTED_GUEST);
 +              bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
 +
 +              /* VM-entry interruption-info field: interruption type */
 +              if (intr_type == INTR_TYPE_RESERVED ||
 +                  (intr_type == INTR_TYPE_OTHER_EVENT &&
 +                   !nested_cpu_supports_monitor_trap_flag(vcpu)))
 +                      return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 +
 +              /* VM-entry interruption-info field: vector */
 +              if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
 +                  (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
 +                  (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
 +                      return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 +
 +              /* VM-entry interruption-info field: deliver error code */
 +              should_have_error_code =
 +                      intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
 +                      x86_exception_has_error_code(vector);
 +              if (has_error_code != should_have_error_code)
 +                      return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 +
 +              /* VM-entry exception error code */
 +              if (has_error_code &&
 +                  vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))
 +                      return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 +
 +              /* VM-entry interruption-info field: reserved bits */
 +              if (intr_info & INTR_INFO_RESVD_BITS_MASK)
 +                      return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 +
 +              /* VM-entry instruction length */
 +              switch (intr_type) {
 +              case INTR_TYPE_SOFT_EXCEPTION:
 +              case INTR_TYPE_SOFT_INTR:
 +              case INTR_TYPE_PRIV_SW_EXCEPTION:
 +                      if ((vmcs12->vm_entry_instruction_len > 15) ||
 +                          (vmcs12->vm_entry_instruction_len == 0 &&
 +                           !nested_cpu_has_zero_length_injection(vcpu)))
 +                              return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 +              }
 +      }
 +
        return 0;
  }
  
@@@ -11749,11 -11442,10 +11773,10 @@@ static int check_vmentry_postreqs(struc
        return 0;
  }
  
 -static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       u32 msr_entry_idx;
        u32 exit_qual;
        int r;
  
                vcpu->arch.tsc_offset += vmcs12->tsc_offset;
  
        r = EXIT_REASON_INVALID_STATE;
 -      if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
 +      if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
                goto fail;
  
        nested_get_vmcs12_pages(vcpu, vmcs12);
  
        r = EXIT_REASON_MSR_LOAD_FAIL;
-       msr_entry_idx = nested_vmx_load_msr(vcpu,
-                                           vmcs12->vm_entry_msr_load_addr,
-                                           vmcs12->vm_entry_msr_load_count);
-       if (msr_entry_idx)
+       exit_qual = nested_vmx_load_msr(vcpu,
+                                       vmcs12->vm_entry_msr_load_addr,
+                                       vmcs12->vm_entry_msr_load_count);
+       if (exit_qual)
                goto fail;
  
        /*
@@@ -11871,22 -11563,20 +11894,22 @@@ static int nested_vmx_run(struct kvm_vc
         * the nested entry.
         */
  
 -      ret = enter_vmx_non_root_mode(vcpu, true);
 -      if (ret)
 +      vmx->nested.nested_run_pending = 1;
 +      ret = enter_vmx_non_root_mode(vcpu);
 +      if (ret) {
 +              vmx->nested.nested_run_pending = 0;
                return ret;
 +      }
  
        /*
         * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
         * by event injection, halt vcpu.
         */
        if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
 -          !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK))
 +          !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) {
 +              vmx->nested.nested_run_pending = 0;
                return kvm_vcpu_halt(vcpu);
 -
 -      vmx->nested.nested_run_pending = 1;
 -
 +      }
        return 1;
  
  out:
@@@ -12258,20 -11948,12 +12281,20 @@@ static void load_vmcs12_host_state(stru
  
        load_vmcs12_mmu_host_state(vcpu, vmcs12);
  
 -      if (enable_vpid) {
 -              /*
 -               * Trivially support vpid by letting L2s share their parent
 -               * L1's vpid. TODO: move to a more elaborate solution, giving
 -               * each L2 its own vpid and exposing the vpid feature to L1.
 -               */
 +      /*
 +       * If vmcs01 doesn't use VPID, the CPU flushes the TLB on every
 +       * VMEntry/VMExit, so no flush is needed here.
 +       *
 +       * If vmcs12 uses VPID, TLB entries populated by L2 are
 +       * tagged with vmx->nested.vpid02 while L1 entries are tagged
 +       * with vmx->vpid, so again no flush is needed.
 +       *
 +       * Therefore, flush the TLB only when vmcs01 uses VPID and
 +       * vmcs12 doesn't, since in that case L1 and L2 TLB entries
 +       * are both tagged with vmx->vpid.
 +       */
 +      if (enable_vpid &&
 +          !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
                vmx_flush_tlb(vcpu, true);
        }
  
@@@ -12410,9 -12092,10 +12433,9 @@@ static void nested_vmx_vmexit(struct kv
        if (kvm_has_tsc_control)
                decache_tsc_multiplier(vmx);
  
 -      if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
 -              vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
 -              vmx_set_virtual_x2apic_mode(vcpu,
 -                              vcpu->arch.apic_base & X2APIC_ENABLE);
 +      if (vmx->nested.change_vmcs01_virtual_apic_mode) {
 +              vmx->nested.change_vmcs01_virtual_apic_mode = false;
 +              vmx_set_virtual_apic_mode(vcpu);
        } else if (!nested_cpu_has_ept(vmcs12) &&
                   nested_cpu_has2(vmcs12,
                                   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@@ -12576,7 -12259,7 +12599,7 @@@ static inline int u64_shl_div_u64(u64 a
  static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
  {
        struct vcpu_vmx *vmx;
 -      u64 tscl, guest_tscl, delta_tsc;
 +      u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
  
        if (kvm_mwait_in_guest(vcpu->kvm))
                return -EOPNOTSUPP;
        tscl = rdtsc();
        guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
        delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
 +      lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
 +
 +      if (delta_tsc > lapic_timer_advance_cycles)
 +              delta_tsc -= lapic_timer_advance_cycles;
 +      else
 +              delta_tsc = 0;
  
        /* Convert to host delta tsc if tsc scaling is enabled */
        if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
@@@ -12961,7 -12638,7 +12984,7 @@@ static int vmx_pre_leave_smm(struct kvm
  
        if (vmx->nested.smm.guest_mode) {
                vcpu->arch.hflags &= ~HF_SMM_MASK;
 -              ret = enter_vmx_non_root_mode(vcpu, false);
 +              ret = enter_vmx_non_root_mode(vcpu);
                vcpu->arch.hflags |= HF_SMM_MASK;
                if (ret)
                        return ret;
@@@ -13046,7 -12723,7 +13069,7 @@@ static struct kvm_x86_ops vmx_x86_ops _
        .enable_nmi_window = enable_nmi_window,
        .enable_irq_window = enable_irq_window,
        .update_cr8_intercept = update_cr8_intercept,
 -      .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
 +      .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
        .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
        .get_enable_apicv = vmx_get_enable_apicv,
        .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
@@@ -13158,7 -12835,6 +13181,7 @@@ static int __init vmx_init(void
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                           crash_vmclear_local_loaded_vmcss);
  #endif
 +      vmx_check_vmcs12_offsets();
  
        return 0;
  }
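The vmx.c changes above freeze the vmcs12 layout by asserting every field offset at build time: CHECK_OFFSET() expands to BUILD_BUG_ON_MSG(), so an accidental reordering breaks the build instead of silently breaking save/restore compatibility. Below is a minimal standalone sketch of the same idea in plain C11, using _Static_assert instead of the kernel-only BUILD_BUG_ON_MSG; the struct and field names are invented for illustration and are not part of KVM.

#include <stddef.h>

/*
 * abi_frozen stands in for a structure whose layout must never change
 * (like vmcs12). Each CHECK_OFFSET() line fails the build if a field
 * moves, mirroring vmx_check_vmcs12_offsets() above.
 */
struct abi_frozen {
	unsigned int revision_id;	/* expected at offset 0 */
	unsigned int abort;		/* expected at offset 4 */
	unsigned int launch_state;	/* expected at offset 8 */
};

#define CHECK_OFFSET(field, loc)					\
	_Static_assert(offsetof(struct abi_frozen, field) == (loc),	\
		"Offset of " #field " in struct abi_frozen has changed.")

CHECK_OFFSET(revision_id, 0);
CHECK_OFFSET(abort, 4);
CHECK_OFFSET(launch_state, 8);

int main(void)
{
	/* Nothing happens at run time; the checks fire during compilation. */
	return 0;
}

Reordering or removing any of the three fields makes the corresponding assertion fail with the field name in the error message, which is exactly the guarantee the vmcs12 checks provide for migration compatibility.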
diff --combined arch/x86/kvm/x86.c
index 0046aa70205aa2dfbc0577065250be717ca25b4e,b91c1e1ff45950ee37c59fa4e8cae2053e027451..2b812b3c50881d2b42738792a7ef1a72cdcb9d66
@@@ -138,7 -138,6 +138,7 @@@ module_param(tsc_tolerance_ppm, uint, S
  /* lapic timer advance (tscdeadline mode only) in nanoseconds */
  unsigned int __read_mostly lapic_timer_advance_ns = 0;
  module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
 +EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
  
  static bool __read_mostly vector_hashing = true;
  module_param(vector_hashing, bool, S_IRUGO);
@@@ -319,27 -318,23 +319,27 @@@ u64 kvm_get_apic_base(struct kvm_vcpu *
  }
  EXPORT_SYMBOL_GPL(kvm_get_apic_base);
  
 +enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
 +{
 +      return kvm_apic_mode(kvm_get_apic_base(vcpu));
 +}
 +EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
 +
  int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
  {
 -      u64 old_state = vcpu->arch.apic_base &
 -              (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
 -      u64 new_state = msr_info->data &
 -              (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
 +      enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
 +      enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
        u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
                (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
  
 -      if ((msr_info->data & reserved_bits) || new_state == X2APIC_ENABLE)
 -              return 1;
 -      if (!msr_info->host_initiated &&
 -          ((new_state == MSR_IA32_APICBASE_ENABLE &&
 -            old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
 -           (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
 -            old_state == 0)))
 +      if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
                return 1;
 +      if (!msr_info->host_initiated) {
 +              if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
 +                      return 1;
 +              if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
 +                      return 1;
 +      }
  
        kvm_lapic_set_base(vcpu, msr_info->data);
        return 0;
@@@ -861,7 -856,7 +861,7 @@@ int kvm_set_cr3(struct kvm_vcpu *vcpu, 
        }
  
        if (is_long_mode(vcpu) &&
 -          (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 62)))
 +          (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
                return 1;
        else if (is_pae(vcpu) && is_paging(vcpu) &&
                   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
@@@ -1097,6 -1092,7 +1097,7 @@@ static u32 msr_based_features[] = 
  
        MSR_F10H_DECFG,
        MSR_IA32_UCODE_REV,
+       MSR_IA32_ARCH_CAPABILITIES,
  };
  
  static unsigned int num_msr_based_features;
@@@ -1105,7 -1101,8 +1106,8 @@@ static int kvm_get_msr_feature(struct k
  {
        switch (msr->index) {
        case MSR_IA32_UCODE_REV:
-               rdmsrl(msr->index, msr->data);
+       case MSR_IA32_ARCH_CAPABILITIES:
+               rdmsrl_safe(msr->index, &msr->data);
                break;
        default:
                if (kvm_x86_ops->get_msr_feature(msr))
@@@ -1766,7 -1763,7 +1768,7 @@@ static int do_monotonic_boot(s64 *t, u6
        return mode;
  }
  
 -static int do_realtime(struct timespec *ts, u64 *tsc_timestamp)
 +static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
  {
        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
        unsigned long seq;
@@@ -1799,7 -1796,7 +1801,7 @@@ static bool kvm_get_time_and_clockread(
  }
  
  /* returns true if host is using TSC based clocksource */
 -static bool kvm_get_walltime_and_clockread(struct timespec *ts,
 +static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
                                           u64 *tsc_timestamp)
  {
        /* checked again under seqlock below */
@@@ -2873,7 -2870,6 +2875,7 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_HYPERV_SYNIC2:
        case KVM_CAP_HYPERV_VP_INDEX:
        case KVM_CAP_HYPERV_EVENTFD:
 +      case KVM_CAP_HYPERV_TLBFLUSH:
        case KVM_CAP_PCI_SEGMENT:
        case KVM_CAP_DEBUGREGS:
        case KVM_CAP_X86_ROBUST_SINGLESTEP:
                r = KVM_CLOCK_TSC_STABLE;
                break;
        case KVM_CAP_X86_DISABLE_EXITS:
 -              r |=  KVM_X86_DISABLE_EXITS_HTL | KVM_X86_DISABLE_EXITS_PAUSE;
 +              r |=  KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
                if(kvm_can_mwait_in_guest())
                        r |= KVM_X86_DISABLE_EXITS_MWAIT;
                break;
@@@ -3968,7 -3964,7 +3970,7 @@@ out_nofree
        return r;
  }
  
 -int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 +vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
  {
        return VM_FAULT_SIGBUS;
  }
@@@ -4254,7 -4250,7 +4256,7 @@@ split_irqchip_unlock
                if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
                        kvm_can_mwait_in_guest())
                        kvm->arch.mwait_in_guest = true;
 -              if (cap->args[0] & KVM_X86_DISABLE_EXITS_HTL)
 +              if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
                        kvm->arch.hlt_in_guest = true;
                if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
                        kvm->arch.pause_in_guest = true;
@@@ -4793,10 -4789,11 +4795,10 @@@ static int kvm_fetch_guest_virt(struct 
        return X86EMUL_CONTINUE;
  }
  
 -int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 +int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
                               gva_t addr, void *val, unsigned int bytes,
                               struct x86_exception *exception)
  {
 -      struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
  
        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
  }
  EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
  
 -static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 -                                    gva_t addr, void *val, unsigned int bytes,
 -                                    struct x86_exception *exception)
 +static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
 +                           gva_t addr, void *val, unsigned int bytes,
 +                           struct x86_exception *exception, bool system)
  {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 -      return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
 +      u32 access = 0;
 +
 +      if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
 +              access |= PFERR_USER_MASK;
 +
 +      return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
  }
  
  static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
        return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
  }
  
 -int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 -                                     gva_t addr, void *val,
 -                                     unsigned int bytes,
 -                                     struct x86_exception *exception)
 +static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 +                                    struct kvm_vcpu *vcpu, u32 access,
 +                                    struct x86_exception *exception)
  {
 -      struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        void *data = val;
        int r = X86EMUL_CONTINUE;
  
        while (bytes) {
                gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
 -                                                           PFERR_WRITE_MASK,
 +                                                           access,
                                                             exception);
                unsigned offset = addr & (PAGE_SIZE-1);
                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
  out:
        return r;
  }
 +
 +static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
 +                            unsigned int bytes, struct x86_exception *exception,
 +                            bool system)
 +{
 +      struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 +      u32 access = PFERR_WRITE_MASK;
 +
 +      if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
 +              access |= PFERR_USER_MASK;
 +
 +      return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 +                                         access, exception);
 +}
 +
 +int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 +                              unsigned int bytes, struct x86_exception *exception)
 +{
 +      return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 +                                         PFERR_WRITE_MASK, exception);
 +}
  EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
  
  int handle_ud(struct kvm_vcpu *vcpu)
        struct x86_exception e;
  
        if (force_emulation_prefix &&
 -          kvm_read_guest_virt(&vcpu->arch.emulate_ctxt,
 -                              kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 &&
 +          kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
 +                              sig, sizeof(sig), &e) == 0 &&
            memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
                kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
                emul_type = 0;
@@@ -5629,8 -5602,8 +5631,8 @@@ static int emulator_pre_leave_smm(struc
  static const struct x86_emulate_ops emulate_ops = {
        .read_gpr            = emulator_read_gpr,
        .write_gpr           = emulator_write_gpr,
 -      .read_std            = kvm_read_guest_virt_system,
 -      .write_std           = kvm_write_guest_virt_system,
 +      .read_std            = emulator_read_std,
 +      .write_std           = emulator_write_std,
        .read_phys           = kvm_read_guest_phys_system,
        .fetch               = kvm_fetch_guest_virt,
        .read_emulated       = emulator_read_emulated,
@@@ -6646,7 -6619,7 +6648,7 @@@ static int kvm_pv_clock_pairing(struct 
                                unsigned long clock_type)
  {
        struct kvm_clock_pairing clock_pairing;
 -      struct timespec ts;
 +      struct timespec64 ts;
        u64 cycle;
        int ret;
  
@@@ -8567,7 -8540,7 +8569,7 @@@ int kvm_arch_hardware_setup(void
                /*
                 * Make sure the user can only configure tsc_khz values that
                 * fit into a signed integer.
 -               * A min value is not calculated needed because it will always
 +               * A min value is not calculated because it will always
                 * be 1 on all machines.
                 */
                u64 max = min(0x7fffffffULL,
@@@ -8900,14 -8873,13 +8902,14 @@@ int kvm_arch_create_memslot(struct kvm 
                                      slot->base_gfn, level) + 1;
  
                slot->arch.rmap[i] =
 -                      kvzalloc(lpages * sizeof(*slot->arch.rmap[i]), GFP_KERNEL);
 +                      kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
 +                               GFP_KERNEL);
                if (!slot->arch.rmap[i])
                        goto out_free;
                if (i == 0)
                        continue;
  
 -              linfo = kvzalloc(lpages * sizeof(*linfo), GFP_KERNEL);
 +              linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
                if (!linfo)
                        goto out_free;
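The kvm_set_apic_base() rework in the x86.c hunks above replaces raw bit comparisons with an enum lapic_mode and rejects the transitions that are invalid for a guest write: x2APIC back to xAPIC, disabled straight to x2APIC, and any reserved or inconsistent bit pattern. The following hedged userspace sketch only illustrates that validation logic; the enum, helper names and hard-coded bit positions (EN is bit 11, EXTD is bit 10 of IA32_APIC_BASE) are reproduced for illustration and are not KVM's internal API.

#include <stdbool.h>
#include <stdio.h>

enum lapic_mode { LAPIC_DISABLED, LAPIC_XAPIC, LAPIC_X2APIC, LAPIC_INVALID };

#define APICBASE_ENABLE (1u << 11)	/* EN */
#define X2APIC_ENABLE   (1u << 10)	/* EXTD */

static enum lapic_mode apic_mode(unsigned int base)
{
	bool en = base & APICBASE_ENABLE;
	bool x2 = base & X2APIC_ENABLE;

	if (en && x2)
		return LAPIC_X2APIC;
	if (en)
		return LAPIC_XAPIC;
	if (x2)
		return LAPIC_INVALID;	/* EXTD set without EN is meaningless */
	return LAPIC_DISABLED;
}

/* Returns 0 if the guest-requested transition is allowed, 1 otherwise. */
static int check_transition(unsigned int old_base, unsigned int new_base)
{
	enum lapic_mode old = apic_mode(old_base);
	enum lapic_mode new = apic_mode(new_base);

	if (new == LAPIC_INVALID)
		return 1;
	if (old == LAPIC_X2APIC && new == LAPIC_XAPIC)
		return 1;
	if (old == LAPIC_DISABLED && new == LAPIC_X2APIC)
		return 1;
	return 0;
}

int main(void)
{
	/* disabled -> x2APIC is rejected (1), xAPIC -> x2APIC is allowed (0). */
	printf("%d %d\n",
	       check_transition(0, APICBASE_ENABLE | X2APIC_ENABLE),
	       check_transition(APICBASE_ENABLE,
				APICBASE_ENABLE | X2APIC_ENABLE));
	return 0;
}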
  
diff --combined virt/kvm/eventfd.c
index 90d30fbe95aefb1e1a943d5bf29d7aee763fb9d0,fe6eb0fe07f6aaa962c8fe2445512687ccece03b..b20b751286fc612214c59c95e787c9fb0fac50b7
@@@ -119,8 -119,12 +119,12 @@@ irqfd_shutdown(struct work_struct *work
  {
        struct kvm_kernel_irqfd *irqfd =
                container_of(work, struct kvm_kernel_irqfd, shutdown);
+       struct kvm *kvm = irqfd->kvm;
        u64 cnt;
  
+       /* Make sure the irqfd has been initialized in the assign path. */
+       synchronize_srcu(&kvm->irq_srcu);
        /*
         * Synchronize with the wait-queue and unhook ourselves to prevent
         * further events.
@@@ -387,7 -391,6 +391,6 @@@ kvm_irqfd_assign(struct kvm *kvm, struc
  
        idx = srcu_read_lock(&kvm->irq_srcu);
        irqfd_update(kvm, irqfd);
-       srcu_read_unlock(&kvm->irq_srcu, idx);
  
        list_add_tail(&irqfd->list, &kvm->irqfds.items);
  
         * Check if there was an event already pending on the eventfd
         * before we registered, and trigger it as if we didn't miss it.
         */
 -      events = f.file->f_op->poll(f.file, &irqfd->pt);
 +      events = vfs_poll(f.file, &irqfd->pt);
  
        if (events & EPOLLIN)
                schedule_work(&irqfd->inject);
  
-       /*
-        * do not drop the file until the irqfd is fully initialized, otherwise
-        * we might race against the EPOLLHUP
-        */
-       fdput(f);
  #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
        if (kvm_arch_has_irq_bypass()) {
                irqfd->consumer.token = (void *)irqfd->eventfd;
        }
  #endif
  
+       srcu_read_unlock(&kvm->irq_srcu, idx);
+       /*
+        * do not drop the file until the irqfd is fully initialized, otherwise
+        * we might race against the EPOLLHUP
+        */
+       fdput(f);
        return 0;
  
  fail:
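The eventfd.c hunks close a race between irqfd assignment and an EPOLLHUP-driven shutdown: the assign path now keeps kvm->irq_srcu read-locked (and keeps its file reference) until the irqfd is fully wired up, and irqfd_shutdown() begins with synchronize_srcu() so it can never run against a half-initialized irqfd. The userspace sketch below, built on liburcu rather than kernel SRCU, only illustrates that ordering; the struct, the function names and the single-threaded main() are invented, and no actual race is reproduced.

#include <stdio.h>
#include <urcu.h>		/* liburcu; build with: gcc sketch.c -lurcu */

struct fake_irqfd {
	int ready;
};

static struct fake_irqfd ifd;

static void assign_path(void)
{
	rcu_read_lock();	/* kernel: srcu_read_lock(&kvm->irq_srcu) */
	ifd.ready = 0;
	/* ... wire up the wait queue, the irq-bypass consumer, ... */
	ifd.ready = 1;
	rcu_read_unlock();	/* kernel: srcu_read_unlock(), then fdput() */
}

static void shutdown_path(void)
{
	/* kernel: synchronize_srcu(&kvm->irq_srcu) at the top of irqfd_shutdown() */
	synchronize_rcu();	/* wait for any in-flight assign_path() read section */
	printf("shutdown sees ready=%d\n", ifd.ready);
}

int main(void)
{
	rcu_register_thread();
	assign_path();
	shutdown_path();
	rcu_unregister_thread();
	return 0;
}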