Merge tag 'kvmarm-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmar...
author     Paolo Bonzini <[email protected]>
           Fri, 20 May 2022 11:16:27 +0000 (07:16 -0400)
committer  Paolo Bonzini <[email protected]>
           Wed, 25 May 2022 09:09:23 +0000 (05:09 -0400)
KVM/arm64 updates for 5.19

- Add support for the ARMv8.6 WFxT extension

- Guard pages for the EL2 stacks

- Trap and emulate AArch32 ID registers to hide unsupported features

- Ability to select and save/restore the set of hypercalls exposed
  to the guest

- Support for PSCI-initiated suspend in collaboration with userspace

- GICv3 register-based LPI invalidation support

- Move host PMU event merging into the vcpu data structure

- GICv3 ITS save/restore fixes

- The usual set of small-scale cleanups and fixes

[Due to the conflict, KVM_SYSTEM_EVENT_SEV_TERM is relocated
 from 4 to 6. - Paolo]

23 files changed:
Documentation/virt/kvm/api.rst
arch/arm64/include/asm/kvm_host.h
arch/arm64/kvm/arm.c
arch/arm64/kvm/vgic/vgic-init.c
arch/x86/include/asm/kvm-x86-ops.h
arch/x86/include/asm/kvm_host.h
arch/x86/kernel/kvm.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/pmu.h
arch/x86/kvm/svm/pmu.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/pmu_intel.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
include/linux/kvm_host.h
include/uapi/linux/kvm.h
tools/testing/selftests/kvm/.gitignore
tools/testing/selftests/kvm/Makefile
tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
virt/kvm/kvm_main.c

index 0c1b9f139e4a663bf572000f51b79e21ee2262ee,47c483d440f3aaee4ad67f2365320b9cdfe8b8c1..f9016fbb6b20e03e3389ead1b6ff34cd772467e5
@@@ -982,22 -982,12 +982,22 @@@ memory
        __u8 pad2[30];
    };
  
 -If the KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag is returned from the
 -KVM_CAP_XEN_HVM check, it may be set in the flags field of this ioctl.
 -This requests KVM to generate the contents of the hypercall page
 -automatically; hypercalls will be intercepted and passed to userspace
 -through KVM_EXIT_XEN.  In this case, all of the blob size and address
 -fields must be zero.
 +If certain flags are returned from the KVM_CAP_XEN_HVM check, they may
 +be set in the flags field of this ioctl:
 +
 +The KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag requests KVM to generate
 +the contents of the hypercall page automatically; hypercalls will be
 +intercepted and passed to userspace through KVM_EXIT_XEN.  In this
 +case, all of the blob size and address fields must be zero.
 +
 +The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates to KVM that userspace
 +will always use the KVM_XEN_HVM_EVTCHN_SEND ioctl to deliver event
 +channel interrupts rather than manipulating the guest's shared_info
 +structures directly. This, in turn, may allow KVM to enable features
 +such as intercepting the SCHEDOP_poll hypercall to accelerate PV
 +spinlock operation for the guest. Userspace may still use the ioctl
 +to deliver events if it was advertised, even if userspace does not
 +send this indication that it will always do so.
  
  No other flags are currently valid in the struct kvm_xen_hvm_config.
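As an illustration only (this sketch is not part of the patch), a VMM might negotiate
these flags roughly as follows, assuming the usual <linux/kvm.h>, <sys/ioctl.h> and
<err.h> includes and already-open kvm_fd/vm_fd descriptors::

   int caps = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);
   struct kvm_xen_hvm_config cfg = { 0 };

   if (caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
           cfg.flags |= KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL;  /* blob fields stay zero */
   if (caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)
           cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;      /* VMM will use KVM_XEN_HVM_EVTCHN_SEND */

   if (ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg) < 0)
           err(1, "KVM_XEN_HVM_CONFIG");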
  
@@@ -1486,14 -1476,43 +1486,43 @@@ Possible values are
                                   [s390]
     KVM_MP_STATE_LOAD             the vcpu is in a special load/startup state
                                   [s390]
+    KVM_MP_STATE_SUSPENDED        the vcpu is in a suspend state and is waiting
+                                  for a wakeup event [arm64]
     ==========================    ===============================================
  
  On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
  in-kernel irqchip, the multiprocessing state must be maintained by userspace on
  these architectures.
  
- For arm64/riscv:
- ^^^^^^^^^^^^^^^^
+ For arm64:
+ ^^^^^^^^^^
+ If a vCPU is in the KVM_MP_STATE_SUSPENDED state, KVM will emulate the
+ architectural execution of a WFI instruction.
+ 
+ If a wakeup event is recognized, KVM will exit to userspace with a
+ KVM_SYSTEM_EVENT exit, where the event type is KVM_SYSTEM_EVENT_WAKEUP. If
+ userspace wants to honor the wakeup, it must set the vCPU's MP state to
+ KVM_MP_STATE_RUNNABLE. If it does not, KVM will continue to await a wakeup
+ event in subsequent calls to KVM_RUN.
+ 
+ .. warning::
+ 
+      If userspace intends to keep the vCPU in a SUSPENDED state, it is
+      strongly recommended that userspace take action to suppress the
+      wakeup event (such as masking an interrupt). Otherwise, subsequent
+      calls to KVM_RUN will immediately exit with a KVM_SYSTEM_EVENT_WAKEUP
+      event and inadvertently waste CPU cycles.
+ 
+      Additionally, if userspace takes action to suppress a wakeup event,
+      it is strongly recommended that it also restores the vCPU to its
+      original state when the vCPU is made RUNNABLE again. For example,
+      if userspace masked a pending interrupt to suppress the wakeup,
+      the interrupt should be unmasked before returning control to the
+      guest.
+ 
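To illustrate (this sketch is not part of the patch), a VMM run loop could honor the
wakeup like so, where run and vcpu_fd are the mapped kvm_run structure and vCPU file
descriptor::

   if (run->exit_reason == KVM_EXIT_SYSTEM_EVENT &&
       run->system_event.type == KVM_SYSTEM_EVENT_WAKEUP) {
           struct kvm_mp_state mp = { .mp_state = KVM_MP_STATE_RUNNABLE };

           /* Honor the wakeup; otherwise the next KVM_RUN exits again. */
           if (ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp) < 0)
                   err(1, "KVM_SET_MP_STATE");
   }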
+ For riscv:
+ ^^^^^^^^^^
  
  The only states that are valid are KVM_MP_STATE_STOPPED and
  KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not.
@@@ -1897,25 -1916,22 +1926,25 @@@ the future
  4.55 KVM_SET_TSC_KHZ
  --------------------
  
 -:Capability: KVM_CAP_TSC_CONTROL
 +:Capability: KVM_CAP_TSC_CONTROL / KVM_CAP_VM_TSC_CONTROL
  :Architectures: x86
 -:Type: vcpu ioctl
 +:Type: vcpu ioctl / vm ioctl
  :Parameters: virtual tsc_khz
  :Returns: 0 on success, -1 on error
  
  Specifies the tsc frequency for the virtual machine. The unit of the
  frequency is KHz.
  
 +If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also
 +be used as a vm ioctl to set the initial tsc frequency of subsequently
 +created vCPUs.
  
  4.56 KVM_GET_TSC_KHZ
  --------------------
  
 -:Capability: KVM_CAP_GET_TSC_KHZ
 +:Capability: KVM_CAP_GET_TSC_KHZ / KVM_CAP_VM_TSC_CONTROL
  :Architectures: x86
 -:Type: vcpu ioctl
 +:Type: vcpu ioctl / vm ioctl
  :Parameters: none
  :Returns: virtual tsc-khz on success, negative value on error
  
@@@ -2614,6 -2630,24 +2643,24 @@@ EINVAL
  After the vcpu's SVE configuration is finalized, further attempts to
  write this register will fail with EPERM.
  
+ arm64 bitmap feature firmware pseudo-registers have the following bit pattern::
+ 
+   0x6030 0000 0016 <regno:16>
+ 
+ The bitmap feature firmware registers expose the hypercall services that
+ are available for userspace to configure. The set bits correspond to the
+ services that are available for the guests to access. By default, KVM
+ sets all the supported bits during VM initialization. Userspace can
+ discover the available services via KVM_GET_ONE_REG, and write back the
+ bitmap corresponding to the features that it wishes guests to see via
+ KVM_SET_ONE_REG.
+ 
+ Note: These registers are immutable once any of the vCPUs of the VM has
+ run at least once. A KVM_SET_ONE_REG in such a scenario will return
+ -EBUSY to userspace.
+ 
+ (See Documentation/virt/kvm/arm/hypercalls.rst for more details.)
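For illustration only, userspace could trim the standard hypercall services before
any vCPU has run; KVM_REG_ARM_STD_BMAP is assumed to be the register ID this series
adds for regno 0, and allowed_services is a hypothetical policy mask::

   __u64 bmap;
   struct kvm_one_reg reg = {
           .id   = KVM_REG_ARM_STD_BMAP,           /* 0x6030 0000 0016 0000 */
           .addr = (__u64)&bmap,
   };

   ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);          /* discover supported services */
   bmap &= allowed_services;                       /* hide everything else */
   ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);          /* -EBUSY once any vCPU has run */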
  
  MIPS registers are mapped using the lower 32 bits.  The upper 16 of that is
  the register group type:
@@@ -5229,25 -5263,7 +5276,25 @@@ have deterministic behavior
                struct {
                        __u64 gfn;
                } shared_info;
 -              __u64 pad[4];
 +              struct {
 +                      __u32 send_port;
 +                      __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
 +                      __u32 flags;
 +                      union {
 +                              struct {
 +                                      __u32 port;
 +                                      __u32 vcpu;
 +                                      __u32 priority;
 +                              } port;
 +                              struct {
 +                                      __u32 port; /* Zero for eventfd */
 +                                      __s32 fd;
 +                              } eventfd;
 +                              __u32 padding[4];
 +                      } deliver;
 +              } evtchn;
 +              __u32 xen_version;
 +              __u64 pad[8];
        } u;
    };
  
@@@ -5278,30 -5294,6 +5325,30 @@@ KVM_XEN_ATTR_TYPE_SHARED_INF
  
  KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
    Sets the exception vector used to deliver Xen event channel upcalls.
 +  This is the HVM-wide vector injected directly by the hypervisor
 +  (not through the local APIC), typically configured by a guest via
 +  HVM_PARAM_CALLBACK_IRQ.
 +
 +KVM_XEN_ATTR_TYPE_EVTCHN
 +  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
 +  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
 +  an outbound port number for interception of EVTCHNOP_send requests
 +  from the guest. A given sending port number may be directed back
 +  to a specified vCPU (by APIC ID) / port / priority on the guest,
 +  or to trigger events on an eventfd. The vCPU and priority can be
 +  changed by setting KVM_XEN_EVTCHN_UPDATE in a subsequent call,
 +  but other fields cannot change for a given sending port. A port
 +  mapping is removed by using KVM_XEN_EVTCHN_DEASSIGN in the flags
 +  field.
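Purely as an illustration (the port numbers are invented and EVTCHNSTAT_interdomain
comes from Xen's public event_channel.h), intercepting guest EVTCHNOP_send on port 5
and reflecting it back to the guest could be configured like this::

   struct kvm_xen_hvm_attr ev = {
           .type                           = KVM_XEN_ATTR_TYPE_EVTCHN,
           .u.evtchn.send_port             = 5,
           .u.evtchn.type                  = EVTCHNSTAT_interdomain,
           .u.evtchn.deliver.port.port     = 5,
           .u.evtchn.deliver.port.vcpu     = 0,    /* target vCPU, by APIC ID */
           .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
   };

   if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ev) < 0)
           err(1, "KVM_XEN_HVM_SET_ATTR");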
 +
 +KVM_XEN_ATTR_TYPE_XEN_VERSION
 +  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
 +  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
 +  the 32-bit version code returned to the guest when it invokes the
 +  XENVER_version call; typically (XEN_MAJOR << 16 | XEN_MINOR). PV
 +Xen guests will often use this as a dummy hypercall to trigger
 +  event channel delivery, so responding within the kernel without
 +  exiting to userspace is beneficial.
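A minimal sketch (not part of the patch) of advertising Xen 4.16, where the version
numbers are arbitrary::

   struct kvm_xen_hvm_attr ver = {
           .type          = KVM_XEN_ATTR_TYPE_XEN_VERSION,
           .u.xen_version = (4 << 16) | 16,        /* XEN_MAJOR << 16 | XEN_MINOR */
   };

   if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ver) < 0)
           err(1, "KVM_XEN_HVM_SET_ATTR");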
  
  4.127 KVM_XEN_HVM_GET_ATTR
  --------------------------
  :Returns: 0 on success, < 0 on error
  
  Allows Xen VM attributes to be read. For the structure and types,
 -see KVM_XEN_HVM_SET_ATTR above.
 +see KVM_XEN_HVM_SET_ATTR above. The KVM_XEN_ATTR_TYPE_EVTCHN
 +attribute cannot be read.
  
  4.128 KVM_XEN_VCPU_SET_ATTR
  ---------------------------
                        __u64 time_blocked;
                        __u64 time_offline;
                } runstate;
 +              __u32 vcpu_id;
 +              struct {
 +                      __u32 port;
 +                      __u32 priority;
 +                      __u64 expires_ns;
 +              } timer;
 +              __u8 vector;
        } u;
    };
  
@@@ -5389,27 -5373,6 +5436,27 @@@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUS
    or RUNSTATE_offline) to set the current accounted state as of the
    adjusted state_entry_time.
  
 +KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID
 +  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
 +  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the Xen
 +  vCPU ID of the given vCPU, to allow timer-related VCPU operations to
 +  be intercepted by KVM.
 +
 +KVM_XEN_VCPU_ATTR_TYPE_TIMER
 +  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
 +  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
 +  event channel port/priority for the VIRQ_TIMER of the vCPU, as well
 +  as allowing a pending timer to be saved/restored.
 +
 +KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR
 +  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
 +  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
 +  per-vCPU local APIC upcall vector, configured by a Xen guest with
 +  the HVMOP_set_evtchn_upcall_vector hypercall. This is typically
 +  used by Windows guests, and is distinct from the HVM-wide upcall
 +  vector configured with HVM_PARAM_CALLBACK_IRQ.
 +
 +
  4.129 KVM_XEN_VCPU_GET_ATTR
  ---------------------------
  
@@@ -5729,25 -5692,6 +5776,25 @@@ enabled with ``arch_prctl()``, but thi
  The offsets of the state save areas in struct kvm_xsave follow the contents
  of CPUID leaf 0xD on the host.
  
 +4.135 KVM_XEN_HVM_EVTCHN_SEND
 +-----------------------------
 +
 +:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND
 +:Architectures: x86
 +:Type: vm ioctl
 +:Parameters: struct kvm_irq_routing_xen_evtchn
 +:Returns: 0 on success, < 0 on error
 +
 +
 +::
 +
 +   struct kvm_irq_routing_xen_evtchn {
 +      __u32 port;
 +      __u32 vcpu;
 +      __u32 priority;
 +   };
 +
 +This ioctl injects an event channel interrupt directly to the guest vCPU.
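For example (illustrative only; the port and target vCPU are made up), delivering a
2 level event on port 3 might look like::

   struct kvm_irq_routing_xen_evtchn evt = {
           .port     = 3,
           .vcpu     = 0,                          /* target vCPU */
           .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
   };

   if (ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &evt) < 0)
           err(1, "KVM_XEN_HVM_EVTCHN_SEND");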
  
  5. The kvm_run structure
  ========================
@@@ -6088,7 -6032,8 +6135,9 @@@ should put the acknowledged interrupt v
    #define KVM_SYSTEM_EVENT_SHUTDOWN       1
    #define KVM_SYSTEM_EVENT_RESET          2
    #define KVM_SYSTEM_EVENT_CRASH          3
-   #define KVM_SYSTEM_EVENT_SEV_TERM       4
+   #define KVM_SYSTEM_EVENT_WAKEUP         4
+   #define KVM_SYSTEM_EVENT_SUSPEND        5
++  #define KVM_SYSTEM_EVENT_SEV_TERM       6
                        __u32 type;
                          __u32 ndata;
                          __u64 data[16];
@@@ -6113,8 -6058,37 +6162,39 @@@ Valid values for 'type' are
     has requested a crash condition maintenance. Userspace can choose
     to ignore the request, or to gather VM memory core dump and/or
     reset/shutdown of the VM.
 + - KVM_SYSTEM_EVENT_SEV_TERM -- an AMD SEV guest requested termination.
 +   The guest physical address of the guest's GHCB is stored in `data[0]`.
+  - KVM_SYSTEM_EVENT_WAKEUP -- the exiting vCPU is in a suspended state and
+    KVM has recognized a wakeup event. Userspace may honor this event by
+    marking the exiting vCPU as runnable, or deny it and call KVM_RUN again.
+  - KVM_SYSTEM_EVENT_SUSPEND -- the guest has requested a suspension of
+    the VM.
+ 
+ For arm/arm64:
+ --------------
+ 
+    KVM_SYSTEM_EVENT_SUSPEND exits are enabled with the
+    KVM_CAP_ARM_SYSTEM_SUSPEND VM capability. If a guest invokes the PSCI
+    SYSTEM_SUSPEND function, KVM will exit to userspace with this event
+    type.
+ 
+    It is the sole responsibility of userspace to implement the PSCI
+    SYSTEM_SUSPEND call according to ARM DEN0022D.b 5.19 "SYSTEM_SUSPEND".
+    KVM does not change the vCPU's state before exiting to userspace, so
+    the call parameters are left in-place in the vCPU registers.
+ 
+    Userspace is _required_ to take action for such an exit. It must
+    either:
+ 
+     - Honor the guest request to suspend the VM. Userspace can request
+       in-kernel emulation of suspension by setting the calling vCPU's
+       state to KVM_MP_STATE_SUSPENDED. Userspace must configure the vCPU's
+       state according to the parameters passed to the PSCI function when
+       the calling vCPU is resumed. See ARM DEN0022D.b 5.19.1 "Intended use"
+       for details on the function parameters.
+ 
+     - Deny the guest request to suspend the VM. See ARM DEN0022D.b 5.19.2
+       "Caller responsibilities" for possible return values.
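To illustrate (not from this patch), honoring the request by deferring to the
in-kernel suspend emulation could look like the fragment below, again assuming the
usual run/vcpu_fd handles::

   if (run->exit_reason == KVM_EXIT_SYSTEM_EVENT &&
       run->system_event.type == KVM_SYSTEM_EVENT_SUSPEND) {
           struct kvm_mp_state mp = { .mp_state = KVM_MP_STATE_SUSPENDED };

           /* The PSCI call parameters are still in the vCPU registers. */
           if (ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp) < 0)
                   err(1, "KVM_SET_MP_STATE");
           /* On a later KVM_SYSTEM_EVENT_WAKEUP, reconfigure the vCPU per
            * ARM DEN0022D.b 5.19.1 before making it RUNNABLE again. */
   }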
  
  If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain
  architecture specific information for the system-level event.  Only
@@@ -7251,15 -7225,6 +7331,15 @@@ The valid bits in cap.args[0] are
                                      Additionally, when this quirk is disabled,
                                      KVM clears CPUID.01H:ECX[bit 3] if
                                      IA32_MISC_ENABLE[bit 18] is cleared.
 +
 + KVM_X86_QUIRK_FIX_HYPERCALL_INSN   By default, KVM rewrites guest
 +                                    VMMCALL/VMCALL instructions to match the
 +                                    vendor's hypercall instruction for the
 +                                    system. When this quirk is disabled, KVM
 +                                    will no longer rewrite invalid guest
 +                                    hypercall instructions. Executing the
 +                                    incorrect hypercall instruction will
 +                                    generate a #UD within the guest.
  =================================== ============================================
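For instance (a sketch, not from the patch), a VMM that wants guests to see #UD for
mismatched hypercall instructions could disable the quirk through the enclosing
KVM_CAP_DISABLE_QUIRKS2 capability::

   struct kvm_enable_cap cap = {
           .cap     = KVM_CAP_DISABLE_QUIRKS2,
           .args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN,
   };

   if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
           err(1, "KVM_ENABLE_CAP(KVM_CAP_DISABLE_QUIRKS2)");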
  
  8. Other capabilities.
@@@ -7737,9 -7702,8 +7817,9 @@@ PVHVM guests. Valid flags are:
    #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR    (1 << 0)
    #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL  (1 << 1)
    #define KVM_XEN_HVM_CONFIG_SHARED_INFO      (1 << 2)
 -  #define KVM_XEN_HVM_CONFIG_RUNSTATE         (1 << 2)
 -  #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL    (1 << 3)
 +  #define KVM_XEN_HVM_CONFIG_RUNSTATE         (1 << 3)
 +  #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL    (1 << 4)
 +  #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND      (1 << 5)
  
  The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
  ioctl is available, for the guest to set its hypercall page.
@@@ -7763,14 -7727,6 +7843,14 @@@ The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL fl
  of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority
  field set to indicate 2 level event channel delivery.
  
 +The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates that KVM supports
 +injecting event channel events directly into the guest with the
 +KVM_XEN_HVM_EVTCHN_SEND ioctl. It also indicates support for the
 +KVM_XEN_ATTR_TYPE_EVTCHN/XEN_VERSION HVM attributes and the
 +KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes
 +related to event channel delivery, timers, and the XENVER_version
 +interception.
 +
  8.31 KVM_CAP_PPC_MULTITCE
  -------------------------
  
@@@ -7858,6 -7814,16 +7938,16 @@@ At this time, KVM_PMU_CAP_DISABLE is th
  this capability will disable PMU virtualization for that VM.  Usermode
  should adjust CPUID leaf 0xA to reflect that the PMU is disabled.
  
+ 
+ 8.36 KVM_CAP_ARM_SYSTEM_SUSPEND
+ -------------------------------
+ 
+ :Capability: KVM_CAP_ARM_SYSTEM_SUSPEND
+ :Architectures: arm64
+ :Type: vm
+ 
+ When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of
+ type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request.
+ 
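A brief sketch (not part of the patch) of opting in at VM creation time::

   struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_SYSTEM_SUSPEND };

   if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
           err(1, "KVM_ENABLE_CAP(KVM_CAP_ARM_SYSTEM_SUSPEND)");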
  9. Known KVM API problems
  =========================
  
index 27ebb2929e0cd3307d7f9ed6c87b52f5558e7612,026e91b8d00bdff3297b272d4483f308b865045d..78e718a1b23dd625df20eede69ed6b4ce48c415e
@@@ -46,6 -46,7 +46,7 @@@
  #define KVM_REQ_RECORD_STEAL  KVM_ARCH_REQ(3)
  #define KVM_REQ_RELOAD_GICv4  KVM_ARCH_REQ(4)
  #define KVM_REQ_RELOAD_PMU    KVM_ARCH_REQ(5)
+ #define KVM_REQ_SUSPEND               KVM_ARCH_REQ(6)
  
  #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
                                     KVM_DIRTY_LOG_INITIALLY_SET)
@@@ -101,12 -102,28 +102,25 @@@ struct kvm_s2_mmu 
  struct kvm_arch_memory_slot {
  };
  
+ /**
+  * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests
+  *
+  * @std_bmap: Bitmap of standard secure service calls
+  * @std_hyp_bmap: Bitmap of standard hypervisor service calls
+  * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls
+  */
+ struct kvm_smccc_features {
+       unsigned long std_bmap;
+       unsigned long std_hyp_bmap;
+       unsigned long vendor_hyp_bmap;
+ };
  struct kvm_arch {
        struct kvm_s2_mmu mmu;
  
        /* VTCR_EL2 value for this VM */
        u64    vtcr;
  
 -      /* The maximum number of vCPUs depends on the used GIC model */
 -      int max_vcpus;
 -
        /* Interrupt controller */
        struct vgic_dist        vgic;
  
         */
  #define KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED            3
  #define KVM_ARCH_FLAG_EL1_32BIT                               4
+       /* PSCI SYSTEM_SUSPEND enabled for the guest */
+ #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED          5
  
        unsigned long flags;
  
  
        u8 pfr0_csv2;
        u8 pfr0_csv3;
+       /* Hypercall features firmware registers' descriptor */
+       struct kvm_smccc_features smccc_feat;
  };
  
  struct kvm_vcpu_fault_info {
@@@ -251,14 -273,8 +270,8 @@@ struct kvm_cpu_context 
        struct kvm_vcpu *__hyp_running_vcpu;
  };
  
- struct kvm_pmu_events {
-       u32 events_host;
-       u32 events_guest;
- };
  struct kvm_host_data {
        struct kvm_cpu_context host_ctxt;
-       struct kvm_pmu_events pmu_events;
  };
  
  struct kvm_host_psci_config {
@@@ -292,8 -308,11 +305,11 @@@ struct vcpu_reset_state 
  
  struct kvm_vcpu_arch {
        struct kvm_cpu_context ctxt;
+       /* Guest floating point state */
        void *sve_state;
        unsigned int sve_max_vl;
+       u64 svcr;
  
        /* Stage 2 paging state used by the hardware on next switch */
        struct kvm_s2_mmu *hw_mmu;
                u32     mdscr_el1;
        } guest_debug_preserved;
  
-       /* vcpu power-off state */
-       bool power_off;
+       /* vcpu power state */
+       struct kvm_mp_state mp_state;
  
        /* Don't run the guest (internal implementation need) */
        bool pause;
  #define KVM_ARM64_DEBUG_STATE_SAVE_TRBE       (1 << 13) /* Save TRBE context if active  */
  #define KVM_ARM64_FP_FOREIGN_FPSTATE  (1 << 14)
  #define KVM_ARM64_ON_UNSUPPORTED_CPU  (1 << 15) /* Physical CPU not in supported_cpus */
+ #define KVM_ARM64_HOST_SME_ENABLED    (1 << 16) /* SME enabled for EL0 */
+ #define KVM_ARM64_WFIT                        (1 << 17) /* WFIT instruction trapped */
  
  #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
                                 KVM_GUESTDBG_USE_SW_BP | \
@@@ -680,10 -701,11 +698,11 @@@ int kvm_handle_cp14_64(struct kvm_vcpu 
  int kvm_handle_cp15_32(struct kvm_vcpu *vcpu);
  int kvm_handle_cp15_64(struct kvm_vcpu *vcpu);
  int kvm_handle_sys_reg(struct kvm_vcpu *vcpu);
+ int kvm_handle_cp10_id(struct kvm_vcpu *vcpu);
  
  void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);
  
- void kvm_sys_reg_table_init(void);
+ int kvm_sys_reg_table_init(void);
  
  /* MMIO helpers */
  void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
@@@ -792,9 -814,6 +811,6 @@@ void kvm_arch_vcpu_put_debug_state_flag
  #ifdef CONFIG_KVM
  void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr);
  void kvm_clr_pmu_events(u32 clr);
- void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu);
- void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
  #else
  static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {}
  static inline void kvm_clr_pmu_events(u32 clr) {}
@@@ -826,8 -845,6 +842,6 @@@ bool kvm_arm_vcpu_is_finalized(struct k
  #define kvm_has_mte(kvm)                                      \
        (system_supports_mte() &&                               \
         test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags))
- #define kvm_vcpu_has_pmu(vcpu)                                        \
-       (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
  
  int kvm_trng_call(struct kvm_vcpu *vcpu);
  #ifdef CONFIG_KVM
@@@ -838,4 -855,7 +852,7 @@@ void __init kvm_hyp_reserve(void)
  static inline void kvm_hyp_reserve(void) { }
  #endif
  
+ void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu);
+ bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu);
  #endif /* __ARM64_KVM_HOST_H__ */
diff --combined arch/arm64/kvm/arm.c
index 7fceb855fa710d0859ee761c05859e8cf846a062,dcf691e3c72ff8780a7cec4443570f5669869883..807b2853b02a81482b195b4a7a7ce735ca3aad55
@@@ -97,6 -97,10 +97,10 @@@ int kvm_vm_ioctl_enable_cap(struct kvm 
                }
                mutex_unlock(&kvm->lock);
                break;
+       case KVM_CAP_ARM_SYSTEM_SUSPEND:
+               r = 0;
+               set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
+               break;
        default:
                r = -EINVAL;
                break;
@@@ -153,9 -157,10 +157,10 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        kvm_vgic_early_init(kvm);
  
        /* The maximum number of VCPUs is limited by the host's GIC model */
 -      kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();
 +      kvm->max_vcpus = kvm_arm_default_max_vcpus();
  
        set_default_spectre(kvm);
+       kvm_arm_init_hypercalls(kvm);
  
        return ret;
  out_free_stage2_pgd:
@@@ -210,6 -215,7 +215,7 @@@ int kvm_vm_ioctl_check_extension(struc
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_VCPU_ATTRIBUTES:
        case KVM_CAP_PTP_KVM:
+       case KVM_CAP_ARM_SYSTEM_SUSPEND:
                r = 1;
                break;
        case KVM_CAP_SET_GUEST_DEBUG2:
        case KVM_CAP_MAX_VCPUS:
        case KVM_CAP_MAX_VCPU_ID:
                if (kvm)
 -                      r = kvm->arch.max_vcpus;
 +                      r = kvm->max_vcpus;
                else
                        r = kvm_arm_default_max_vcpus();
                break;
@@@ -306,7 -312,7 +312,7 @@@ int kvm_arch_vcpu_precreate(struct kvm 
        if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
                return -EBUSY;
  
 -      if (id >= kvm->arch.max_vcpus)
 +      if (id >= kvm->max_vcpus)
                return -EINVAL;
  
        return 0;
@@@ -356,11 -362,6 +362,6 @@@ void kvm_arch_vcpu_destroy(struct kvm_v
        kvm_arm_vcpu_destroy(vcpu);
  }
  
- int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
- {
-       return kvm_timer_is_pending(vcpu);
- }
  void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
  {
  
@@@ -432,20 -433,34 +433,34 @@@ void kvm_arch_vcpu_put(struct kvm_vcpu 
        vcpu->cpu = -1;
  }
  
- static void vcpu_power_off(struct kvm_vcpu *vcpu)
+ void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
  {
-       vcpu->arch.power_off = true;
+       vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
        kvm_make_request(KVM_REQ_SLEEP, vcpu);
        kvm_vcpu_kick(vcpu);
  }
  
+ bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
+ {
+       return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED;
+ }
+ static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
+ {
+       vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED;
+       kvm_make_request(KVM_REQ_SUSPEND, vcpu);
+       kvm_vcpu_kick(vcpu);
+ }
+ static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
+ {
+       return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED;
+ }
  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
  {
-       if (vcpu->arch.power_off)
-               mp_state->mp_state = KVM_MP_STATE_STOPPED;
-       else
-               mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+       *mp_state = vcpu->arch.mp_state;
  
        return 0;
  }
@@@ -457,10 -472,13 +472,13 @@@ int kvm_arch_vcpu_ioctl_set_mpstate(str
  
        switch (mp_state->mp_state) {
        case KVM_MP_STATE_RUNNABLE:
-               vcpu->arch.power_off = false;
+               vcpu->arch.mp_state = *mp_state;
                break;
        case KVM_MP_STATE_STOPPED:
-               vcpu_power_off(vcpu);
+               kvm_arm_vcpu_power_off(vcpu);
+               break;
+       case KVM_MP_STATE_SUSPENDED:
+               kvm_arm_vcpu_suspend(vcpu);
                break;
        default:
                ret = -EINVAL;
@@@ -480,7 -498,7 +498,7 @@@ int kvm_arch_vcpu_runnable(struct kvm_v
  {
        bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
        return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
-               && !v->arch.power_off && !v->arch.pause);
+               && !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
  }
  
  bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
@@@ -592,15 -610,15 +610,15 @@@ void kvm_arm_resume_guest(struct kvm *k
        }
  }
  
- static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
+ static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
  {
        struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
  
        rcuwait_wait_event(wait,
-                          (!vcpu->arch.power_off) &&(!vcpu->arch.pause),
+                          (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
                           TASK_INTERRUPTIBLE);
  
-       if (vcpu->arch.power_off || vcpu->arch.pause) {
+       if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
                /* Awaken to handle a signal, request we sleep again later. */
                kvm_make_request(KVM_REQ_SLEEP, vcpu);
        }
@@@ -639,6 -657,7 +657,7 @@@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu
        preempt_enable();
  
        kvm_vcpu_halt(vcpu);
+       vcpu->arch.flags &= ~KVM_ARM64_WFIT;
        kvm_clear_request(KVM_REQ_UNHALT, vcpu);
  
        preempt_disable();
        preempt_enable();
  }
  
- static void check_vcpu_requests(struct kvm_vcpu *vcpu)
+ static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
+ {
+       if (!kvm_arm_vcpu_suspended(vcpu))
+               return 1;
+       kvm_vcpu_wfi(vcpu);
+       /*
+        * The suspend state is sticky; we do not leave it until userspace
+        * explicitly marks the vCPU as runnable. Request that we suspend again
+        * later.
+        */
+       kvm_make_request(KVM_REQ_SUSPEND, vcpu);
+       /*
+        * Check to make sure the vCPU is actually runnable. If so, exit to
+        * userspace informing it of the wakeup condition.
+        */
+       if (kvm_arch_vcpu_runnable(vcpu)) {
+               memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
+               vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
+               vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+               return 0;
+       }
+       /*
+        * Otherwise, we were unblocked to process a different event, such as a
+        * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
+        * process the event.
+        */
+       return 1;
+ }
+ /**
+  * check_vcpu_requests - check and handle pending vCPU requests
+  * @vcpu:     the VCPU pointer
+  *
+  * Return: 1 if we should enter the guest
+  *       0 if we should exit to userspace
+  *       < 0 if we should exit to userspace, where the return value indicates
+  *       an error
+  */
+ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
  {
        if (kvm_request_pending(vcpu)) {
                if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
-                       vcpu_req_sleep(vcpu);
+                       kvm_vcpu_sleep(vcpu);
  
                if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
                        kvm_reset_vcpu(vcpu);
                if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
                        kvm_pmu_handle_pmcr(vcpu,
                                            __vcpu_sys_reg(vcpu, PMCR_EL0));
+               if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
+                       return kvm_vcpu_suspend(vcpu);
        }
+       return 1;
  }
  
  static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
@@@ -791,7 -857,8 +857,8 @@@ int kvm_arch_vcpu_ioctl_run(struct kvm_
                if (!ret)
                        ret = 1;
  
-               check_vcpu_requests(vcpu);
+               if (ret > 0)
+                       ret = check_vcpu_requests(vcpu);
  
                /*
                 * Preparing the interrupts to be injected also
  
                kvm_vgic_flush_hwstate(vcpu);
  
+               kvm_pmu_update_vcpu_events(vcpu);
                /*
                 * Ensure we set mode to IN_GUEST_MODE after we disable
                 * interrupts and before the final VCPU requests check.
@@@ -1124,9 -1193,9 +1193,9 @@@ static int kvm_arch_vcpu_ioctl_vcpu_ini
         * Handle the "start in power-off" case.
         */
        if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
-               vcpu_power_off(vcpu);
+               kvm_arm_vcpu_power_off(vcpu);
        else
-               vcpu->arch.power_off = false;
+               vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
  
        return 0;
  }
@@@ -1483,7 -1552,6 +1552,6 @@@ static void cpu_prepare_hyp_mode(int cp
        tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
        params->tcr_el2 = tcr;
  
-       params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE);
        params->pgd_pa = kvm_mmu_get_httbr();
        if (is_protected_kvm_enabled())
                params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
@@@ -1761,8 -1829,6 +1829,6 @@@ static int init_subsystems(void
  
        kvm_register_perf_callbacks(NULL);
  
-       kvm_sys_reg_table_init();
  out:
        if (err || !is_protected_kvm_enabled())
                on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
@@@ -1933,14 -1999,46 +1999,46 @@@ static int init_hyp_mode(void
         * Map the Hyp stack pages
         */
        for_each_possible_cpu(cpu) {
+               struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
                char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-               err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
-                                         PAGE_HYP);
+               unsigned long hyp_addr;
+               /*
+                * Allocate a contiguous HYP private VA range for the stack
+                * and guard page. The allocation is also aligned based on
+                * the order of its size.
+                */
+               err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr);
+               if (err) {
+                       kvm_err("Cannot allocate hyp stack guard page\n");
+                       goto out_err;
+               }
  
+               /*
+                * Since the stack grows downwards, map the stack to the page
+                * at the higher address and leave the lower guard page
+                * unbacked.
+                *
+                * Any valid stack address now has the PAGE_SHIFT bit as 1
+                * and addresses corresponding to the guard page have the
+                * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+                */
+               err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE,
+                                           __pa(stack_page), PAGE_HYP);
                if (err) {
                        kvm_err("Cannot map hyp stack\n");
                        goto out_err;
                }
+               /*
+                * Save the stack PA in nvhe_init_params. This will be needed
+                * to recreate the stack mapping in protected nVHE mode.
+                * __hyp_pa() won't do the right thing there, since the stack
+                * has been mapped in the flexible private VA space.
+                */
+               params->stack_pa = __pa(stack_page);
+               params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
        }
  
        for_each_possible_cpu(cpu) {
@@@ -2089,6 -2187,12 +2187,12 @@@ int kvm_arch_init(void *opaque
                return -ENODEV;
        }
  
+       err = kvm_sys_reg_table_init();
+       if (err) {
+               kvm_info("Error initializing system register tables");
+               return err;
+       }
        in_hyp_mode = is_kernel_in_hyp_mode();
  
        if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
index 77feafd5c0e3f806b0d51e3e4ca3f1774a099ff1,f84e04f334c68c0e6c37f4e1e2b1007aada3cdc3..f6d4f4052555c79aef58f3d7b87440dbe9c7c276
@@@ -98,11 -98,11 +98,11 @@@ int kvm_vgic_create(struct kvm *kvm, u3
        ret = 0;
  
        if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
 -              kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
 +              kvm->max_vcpus = VGIC_V2_MAX_CPUS;
        else
 -              kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
 +              kvm->max_vcpus = VGIC_V3_MAX_CPUS;
  
 -      if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
 +      if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) {
                ret = -E2BIG;
                goto out_unlock;
        }
@@@ -319,7 -319,12 +319,12 @@@ int vgic_init(struct kvm *kvm
  
        vgic_debug_init(kvm);
  
-       dist->implementation_rev = 2;
+       /*
+        * If userspace didn't set the GIC implementation revision,
+        * default to the latest and greatest. You know you want it.
+        */
+       if (!dist->implementation_rev)
+               dist->implementation_rev = KVM_VGIC_IMP_REV_LATEST;
        dist->initialized = true;
  
  out:
index 96e4e9842dfc6aac0ccde9723da79d9ea82199b8,1a6d7e3f6c32c7f88917270b1f478608d6017830..da47f60a46509ff92bed7c433e5c4f660755e9ec
@@@ -118,6 -118,7 +118,7 @@@ KVM_X86_OP_OPTIONAL(mem_enc_register_re
  KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
  KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
  KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
+ KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
  KVM_X86_OP(get_msr_feature)
  KVM_X86_OP(can_emulate_instruction)
  KVM_X86_OP(apic_init_signal_blocked)
@@@ -126,7 -127,6 +127,7 @@@ KVM_X86_OP_OPTIONAL(migrate_timers
  KVM_X86_OP(msr_filter_changed)
  KVM_X86_OP(complete_emulated_msr)
  KVM_X86_OP(vcpu_deliver_sipi_vector)
 +KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
  
  #undef KVM_X86_OP
  #undef KVM_X86_OP_OPTIONAL
index c5fb4115176de9bc3d42ae49c868b7addd87a1c2,4ff36610af6ab5252d4956aa57d572bf45ee287a..959d66b9be94d0230445faa642fc41e380bf5552
@@@ -281,11 -281,11 +281,11 @@@ struct kvm_kernel_irq_routing_entry
  /*
   * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
   * also includes TDP pages) to determine whether or not a page can be used in
 - * the given MMU context.  This is a subset of the overall kvm_mmu_role to
 + * the given MMU context.  This is a subset of the overall kvm_cpu_role to
   * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
   * 2 bytes per gfn instead of 4 bytes per gfn.
   *
 - * Indirect upper-level shadow pages are tracked for write-protection via
 + * Upper-level shadow pages having gptes are tracked for write-protection via
   * gfn_track.  As above, gfn_track is a 16 bit counter, so KVM must not create
   * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
   * gfn_track will overflow and explosions will ensue.
@@@ -331,8 -331,7 +331,8 @@@ union kvm_mmu_page_role 
                unsigned smap_andnot_wp:1;
                unsigned ad_disabled:1;
                unsigned guest_mode:1;
 -              unsigned :6;
 +              unsigned passthrough:1;
 +              unsigned :5;
  
                /*
                 * This is left at the top of the word so that
@@@ -368,6 -367,8 +368,6 @@@ union kvm_mmu_extended_role 
        struct {
                unsigned int valid:1;
                unsigned int execonly:1;
 -              unsigned int cr0_pg:1;
 -              unsigned int cr4_pae:1;
                unsigned int cr4_pse:1;
                unsigned int cr4_pke:1;
                unsigned int cr4_smap:1;
        };
  };
  
 -union kvm_mmu_role {
 +union kvm_cpu_role {
        u64 as_u64;
        struct {
                union kvm_mmu_page_role base;
@@@ -437,8 -438,19 +437,8 @@@ struct kvm_mmu 
                         struct kvm_mmu_page *sp);
        void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
        struct kvm_mmu_root_info root;
 -      union kvm_mmu_role mmu_role;
 -      u8 root_level;
 -      u8 shadow_root_level;
 -      u8 ept_ad;
 -      bool direct_map;
 -      struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
 -
 -      /*
 -       * Bitmap; bit set = permission fault
 -       * Byte index: page fault error code [4:1]
 -       * Bit index: pte permissions in ACC_* format
 -       */
 -      u8 permissions[16];
 +      union kvm_cpu_role cpu_role;
 +      union kvm_mmu_page_role root_role;
  
        /*
        * The pkru_mask indicates if protection key checks are needed.  It
        */
        u32 pkru_mask;
  
 +      struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
 +
 +      /*
 +       * Bitmap; bit set = permission fault
 +       * Byte index: page fault error code [4:1]
 +       * Bit index: pte permissions in ACC_* format
 +       */
 +      u8 permissions[16];
 +
        u64 *pae_root;
        u64 *pml4_root;
        u64 *pml5_root;
@@@ -604,21 -607,16 +604,21 @@@ struct kvm_vcpu_hv 
  struct kvm_vcpu_xen {
        u64 hypercall_rip;
        u32 current_runstate;
 -      bool vcpu_info_set;
 -      bool vcpu_time_info_set;
 -      bool runstate_set;
 -      struct gfn_to_hva_cache vcpu_info_cache;
 -      struct gfn_to_hva_cache vcpu_time_info_cache;
 -      struct gfn_to_hva_cache runstate_cache;
 +      u8 upcall_vector;
 +      struct gfn_to_pfn_cache vcpu_info_cache;
 +      struct gfn_to_pfn_cache vcpu_time_info_cache;
 +      struct gfn_to_pfn_cache runstate_cache;
        u64 last_steal;
        u64 runstate_entry_time;
        u64 runstate_times[4];
        unsigned long evtchn_pending_sel;
 +      u32 vcpu_id; /* The Xen / ACPI vCPU ID */
 +      u32 timer_virq;
 +      u64 timer_expires; /* In guest epoch */
 +      atomic_t timer_pending;
 +      struct hrtimer timer;
 +      int poll_evtchn;
 +      struct timer_list poll_timer;
  };
  
  struct kvm_vcpu_arch {
        gpa_t time;
        struct pvclock_vcpu_time_info hv_clock;
        unsigned int hw_tsc_khz;
 -      struct gfn_to_hva_cache pv_time;
 -      bool pv_time_enabled;
 +      struct gfn_to_pfn_cache pv_time;
        /* set guest stopped flag in pvclock flags field */
        bool pvclock_set_guest_stopped_request;
  
@@@ -1025,12 -1024,9 +1025,12 @@@ struct msr_bitmap_range 
  
  /* Xen emulation context */
  struct kvm_xen {
 +      u32 xen_version;
        bool long_mode;
        u8 upcall_vector;
        struct gfn_to_pfn_cache shinfo_cache;
 +      struct idr evtchn_ports;
 +      unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
  };
  
  enum kvm_irqchip_mode {
@@@ -1123,8 -1119,6 +1123,8 @@@ struct kvm_arch 
        u64 cur_tsc_generation;
        int nr_vcpus_matched_tsc;
  
 +      u32 default_tsc_khz;
 +
        seqcount_raw_spinlock_t pvclock_sc;
        bool use_master_clock;
        u64 master_kernel_ns;
@@@ -1269,12 -1263,7 +1269,12 @@@ struct kvm_vm_stat 
  
  struct kvm_vcpu_stat {
        struct kvm_vcpu_stat_generic generic;
 +      u64 pf_taken;
        u64 pf_fixed;
 +      u64 pf_emulate;
 +      u64 pf_spurious;
 +      u64 pf_fast;
 +      u64 pf_mmio_spte_created;
        u64 pf_guest;
        u64 tlb_flush;
        u64 invlpg;
@@@ -1466,6 -1455,8 +1466,6 @@@ struct kvm_x86_ops 
        int cpu_dirty_log_size;
        void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
  
 -      /* pmu operations of sub-arch */
 -      const struct kvm_pmu_ops *pmu_ops;
        const struct kvm_x86_nested_ops *nested_ops;
  
        void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
        int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
        int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
        int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
+       void (*guest_memory_reclaimed)(struct kvm *kvm);
  
        int (*get_msr_feature)(struct kvm_msr_entry *entry);
  
        int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
  
        void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
 +
 +      /*
 +       * Returns vCPU specific APICv inhibit reasons
 +       */
 +      unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
  };
  
  struct kvm_x86_nested_ops {
        void (*leave_nested)(struct kvm_vcpu *vcpu);
        int (*check_events)(struct kvm_vcpu *vcpu);
 +      bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu,
 +                                           struct x86_exception *fault);
        bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
        void (*triple_fault)(struct kvm_vcpu *vcpu);
        int (*get_state)(struct kvm_vcpu *vcpu,
@@@ -1543,7 -1528,6 +1544,7 @@@ struct kvm_x86_init_ops 
        unsigned int (*handle_intel_pt_intr)(void);
  
        struct kvm_x86_ops *runtime_ops;
 +      struct kvm_pmu_ops *pmu_ops;
  };
  
  struct kvm_arch_async_pf {
@@@ -1565,6 -1549,20 +1566,6 @@@ extern struct kvm_x86_ops kvm_x86_ops
  #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
  #include <asm/kvm-x86-ops.h>
  
 -static inline void kvm_ops_static_call_update(void)
 -{
 -#define __KVM_X86_OP(func) \
 -      static_call_update(kvm_x86_##func, kvm_x86_ops.func);
 -#define KVM_X86_OP(func) \
 -      WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
 -#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
 -#define KVM_X86_OP_OPTIONAL_RET0(func) \
 -      static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
 -                                         (void *)__static_call_return0);
 -#include <asm/kvm-x86-ops.h>
 -#undef __KVM_X86_OP
 -}
 -
  #define __KVM_HAVE_ARCH_VM_ALLOC
  static inline struct kvm *kvm_arch_alloc_vm(void)
  {
@@@ -1802,7 -1800,6 +1803,7 @@@ gpa_t kvm_mmu_gva_to_gpa_system(struct 
                                struct x86_exception *exception);
  
  bool kvm_apicv_activated(struct kvm *kvm);
 +bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
  void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
  void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
                                      enum kvm_apicv_inhibit reason, bool set);
@@@ -1992,7 -1989,6 +1993,7 @@@ int memslot_rmap_alloc(struct kvm_memor
         KVM_X86_QUIRK_CD_NW_CLEARED |          \
         KVM_X86_QUIRK_LAPIC_MMIO_HOLE |        \
         KVM_X86_QUIRK_OUT_7E_INC_RIP |         \
 -       KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)
 +       KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT |   \
 +       KVM_X86_QUIRK_FIX_HYPERCALL_INSN)
  
  #endif /* _ASM_X86_KVM_HOST_H */
diff --combined arch/x86/kernel/kvm.c
index d0bb2b3fb305f92e1254152764dc5c7e785cea1c,8b1c45c9cda8771a446aed8b4a62849dda08e77e..b48ce07756ca83a5ab05e99d8e996c91b270d095
@@@ -69,6 -69,7 +69,7 @@@ static DEFINE_PER_CPU_DECRYPTED(struct 
  DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
  static int has_steal_clock = 0;
  
+ static int has_guest_poll = 0;
  /*
   * No need for any "IO delay" on KVM
   */
@@@ -706,14 -707,26 +707,26 @@@ static int kvm_cpu_down_prepare(unsigne
  
  static int kvm_suspend(void)
  {
+       u64 val = 0;
        kvm_guest_cpu_offline(false);
  
+ #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+       if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+               rdmsrl(MSR_KVM_POLL_CONTROL, val);
+       has_guest_poll = !(val & 1);
+ #endif
        return 0;
  }
  
  static void kvm_resume(void)
  {
        kvm_cpu_online(raw_smp_processor_id());
+ #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+       if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
+               wrmsrl(MSR_KVM_POLL_CONTROL, 0);
+ #endif
  }
  
  static struct syscore_ops kvm_syscore_ops = {
@@@ -752,42 -765,6 +765,42 @@@ static void kvm_crash_shutdown(struct p
  }
  #endif
  
 +#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
 +bool __kvm_vcpu_is_preempted(long cpu);
 +
 +__visible bool __kvm_vcpu_is_preempted(long cpu)
 +{
 +      struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
 +
 +      return !!(src->preempted & KVM_VCPU_PREEMPTED);
 +}
 +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
 +
 +#else
 +
 +#include <asm/asm-offsets.h>
 +
 +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
 +
 +/*
 + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
 + * restoring to/from the stack.
 + */
 +asm(
 +".pushsection .text;"
 +".global __raw_callee_save___kvm_vcpu_is_preempted;"
 +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
 +"__raw_callee_save___kvm_vcpu_is_preempted:"
 +ASM_ENDBR
 +"movq __per_cpu_offset(,%rdi,8), %rax;"
 +"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
 +"setne        %al;"
 +ASM_RET
 +".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
 +".popsection");
 +
 +#endif
 +
  static void __init kvm_guest_init(void)
  {
        int i;
        if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
                has_steal_clock = 1;
                static_call_update(pv_steal_clock, kvm_steal_clock);
 +
 +              pv_ops.lock.vcpu_is_preempted =
 +                      PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
        }
  
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@@ -1044,6 -1018,40 +1057,6 @@@ static void kvm_wait(u8 *ptr, u8 val
        }
  }
  
 -#ifdef CONFIG_X86_32
 -__visible bool __kvm_vcpu_is_preempted(long cpu)
 -{
 -      struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
 -
 -      return !!(src->preempted & KVM_VCPU_PREEMPTED);
 -}
 -PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
 -
 -#else
 -
 -#include <asm/asm-offsets.h>
 -
 -extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
 -
 -/*
 - * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
 - * restoring to/from the stack.
 - */
 -asm(
 -".pushsection .text;"
 -".global __raw_callee_save___kvm_vcpu_is_preempted;"
 -".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
 -"__raw_callee_save___kvm_vcpu_is_preempted:"
 -ASM_ENDBR
 -"movq __per_cpu_offset(,%rdi,8), %rax;"
 -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
 -"setne        %al;"
 -ASM_RET
 -".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
 -".popsection");
 -
 -#endif
 -
  /*
   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
   */
@@@ -1087,6 -1095,10 +1100,6 @@@ void __init kvm_spinlock_init(void
        pv_ops.lock.wait = kvm_wait;
        pv_ops.lock.kick = kvm_kick_cpu;
  
 -      if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
 -              pv_ops.lock.vcpu_is_preempted =
 -                      PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
 -      }
        /*
         * When PV spinlock is enabled which is preferred over
         * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
diff --combined arch/x86/kvm/cpuid.c
index 732724ea5b100936ec46639df6e236858a354d7f,598334ed5fbc8db78fb6ee5e2492d6a47884a339..0c1ba6aa07651f4d2698b004c35c27aa1b406118
@@@ -887,11 -887,6 +887,11 @@@ static inline int __do_cpuid_func(struc
                union cpuid10_eax eax;
                union cpuid10_edx edx;
  
 +              if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
 +                      entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 +                      break;
 +              }
 +
                perf_get_x86_pmu_capability(&cap);
  
                /*
        case 0x80000000:
                entry->eax = min(entry->eax, 0x80000021);
                /*
-                * Serializing LFENCE is reported in a multitude of ways,
-                * and NullSegClearsBase is not reported in CPUID on Zen2;
-                * help userspace by providing the CPUID leaf ourselves.
+                * Serializing LFENCE is reported in a multitude of ways, and
+                * NullSegClearsBase is not reported in CPUID on Zen2; help
+                * userspace by providing the CPUID leaf ourselves.
+                *
+                * However, only do it if the host has CPUID leaf 0x8000001d.
+                * QEMU thinks that it can query the host blindly for that
+                * CPUID leaf if KVM reports that it supports 0x8000001d or
+                * above.  The processor merrily returns values from the
+                * highest Intel leaf which QEMU tries to use as the guest's
+                * 0x8000001d.  Even worse, this can result in an infinite
+                * loop if said highest leaf has no subleaves indexed by ECX.
                 */
-               if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
-                   || !static_cpu_has_bug(X86_BUG_NULL_SEG))
+               if (entry->eax >= 0x8000001d &&
+                   (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
+                    || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
                        entry->eax = max(entry->eax, 0x80000021);
                break;
        case 0x80000001:
diff --combined arch/x86/kvm/pmu.h
index 2a53b6c9495cac2f05eb59a3b6ff83da6cb40394,22992b049d380f55f36660e5d866943f8856fddc..e745f443b6a8fac72e4ff85c11067fad300f9a30
@@@ -39,8 -39,6 +39,8 @@@ struct kvm_pmu_ops 
        void (*cleanup)(struct kvm_vcpu *vcpu);
  };
  
 +void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
 +
  static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
  {
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@@ -88,6 -86,11 +88,6 @@@ static inline bool pmc_is_fixed(struct 
        return pmc->type == KVM_PMC_FIXED;
  }
  
 -static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
 -{
 -      return kvm_x86_ops.pmu_ops->pmc_is_enabled(pmc);
 -}
 -
  static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
                                                 u64 data)
  {
@@@ -135,6 -138,15 +135,15 @@@ static inline u64 get_sample_period(str
        return sample_period;
  }
  
+ static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
+ {
+       if (!pmc->perf_event || pmc->is_paused)
+               return;
+       perf_event_period(pmc->perf_event,
+                         get_sample_period(pmc, pmc->counter));
+ }
  void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
  void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
  void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
diff --combined arch/x86/kvm/svm/pmu.c
index 47e8eaca1e90666831e72e09115490c28c51a393,b14860863c39417e3ab196c5e54c9b6e392eac19..136039fc6d0101c9f69150056f0b885c11ddb030
@@@ -45,22 -45,6 +45,22 @@@ static struct kvm_event_hw_type_mappin
        [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
  };
  
 +/* duplicated from amd_f17h_perfmon_event_map. */
 +static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = {
 +      [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
 +      [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
 +      [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES },
 +      [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES },
 +      [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
 +      [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
 +      [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
 +      [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
 +};
 +
 +/* amd_pmc_perf_hw_id depends on these being the same size */
 +static_assert(ARRAY_SIZE(amd_event_mapping) ==
 +           ARRAY_SIZE(amd_f17h_event_mapping));
 +
  static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
  {
        struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
@@@ -156,7 -140,6 +156,7 @@@ static inline struct kvm_pmc *get_gp_pm
  
  static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
  {
 +      struct kvm_event_hw_type_mapping *event_mapping;
        u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
        u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
        int i;
        if (WARN_ON(pmc_is_fixed(pmc)))
                return PERF_COUNT_HW_MAX;
  
 +      if (guest_cpuid_family(pmc->vcpu) >= 0x17)
 +              event_mapping = amd_f17h_event_mapping;
 +      else
 +              event_mapping = amd_event_mapping;
 +
        for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
 -              if (amd_event_mapping[i].eventsel == event_select
 -                  && amd_event_mapping[i].unit_mask == unit_mask)
 +              if (event_mapping[i].eventsel == event_select
 +                  && event_mapping[i].unit_mask == unit_mask)
                        break;
  
        if (i == ARRAY_SIZE(amd_event_mapping))
                return PERF_COUNT_HW_MAX;
  
 -      return amd_event_mapping[i].event_type;
 +      return event_mapping[i].event_type;
  }
  
  /* check if a PMC is enabled by comparing it against global_ctrl bits. Because
@@@ -279,6 -257,7 +279,7 @@@ static int amd_pmu_set_msr(struct kvm_v
        pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
        if (pmc) {
                pmc->counter += data - pmc_read_counter(pmc);
+               pmc_update_sample_period(pmc);
                return 0;
        }
        /* MSR_EVNTSELn */
@@@ -341,7 -320,7 +342,7 @@@ static void amd_pmu_reset(struct kvm_vc
        }
  }
  
 -struct kvm_pmu_ops amd_pmu_ops = {
 +struct kvm_pmu_ops amd_pmu_ops __initdata = {
        .pmc_perf_hw_id = amd_pmc_perf_hw_id,
        .pmc_is_enabled = amd_pmc_is_enabled,
        .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
diff --combined arch/x86/kvm/svm/sev.c
index b67ce873d5d2e96dcf0be8e6ff2b83e4ca2a0172,0ad70c12c7c311d605c7820c1e9f223415d9e319..94d62c9958b9259525c50d4024e96e299762e4eb
@@@ -2226,51 -2226,47 +2226,47 @@@ int sev_cpu_init(struct svm_cpu_data *s
   * Pages used by hardware to hold guest encrypted state must be flushed before
   * returning them to the system.
   */
- static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va,
-                                  unsigned long len)
+ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
  {
+       int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
        /*
-        * If hardware enforced cache coherency for encrypted mappings of the
-        * same physical page is supported, nothing to do.
+        * Note!  The address must be a kernel address, as regular page walk
+        * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
+        * address is non-deterministic and unsafe.  This function deliberately
+        * takes a pointer to deter passing in a user address.
         */
-       if (boot_cpu_has(X86_FEATURE_SME_COHERENT))
-               return;
+       unsigned long addr = (unsigned long)va;
  
        /*
-        * If the VM Page Flush MSR is supported, use it to flush the page
-        * (using the page virtual address and the guest ASID).
+        * If CPU enforced cache coherency for encrypted mappings of the
+        * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache
+        * flush is still needed in order to work properly with DMA devices.
         */
-       if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) {
-               struct kvm_sev_info *sev;
-               unsigned long va_start;
-               u64 start, stop;
-               /* Align start and stop to page boundaries. */
-               va_start = (unsigned long)va;
-               start = (u64)va_start & PAGE_MASK;
-               stop = PAGE_ALIGN((u64)va_start + len);
-               if (start < stop) {
-                       sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+       if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
+               clflush_cache_range(va, PAGE_SIZE);
+               return;
+       }
  
-                       while (start < stop) {
-                               wrmsrl(MSR_AMD64_VM_PAGE_FLUSH,
-                                      start | sev->asid);
+       /*
+        * VM Page Flush takes a host virtual address and a guest ASID.  Fall
+        * back to WBINVD if this faults so as not to make any problems worse
+        * by leaving stale encrypted data in the cache.
+        */
+       if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
+               goto do_wbinvd;
  
-                               start += PAGE_SIZE;
-                       }
+       return;
  
-                       return;
-               }
+ do_wbinvd:
+       wbinvd_on_all_cpus();
+ }
  
-               WARN(1, "Address overflow, using WBINVD\n");
-       }
+ void sev_guest_memory_reclaimed(struct kvm *kvm)
+ {
+       if (!sev_guest(kvm))
+               return;
  
-       /*
-        * Hardware should always have one of the above features,
-        * but if not, use WBINVD and issue a warning.
-        */
-       WARN_ONCE(1, "Using WBINVD to flush guest memory\n");
        wbinvd_on_all_cpus();
  }
  
@@@ -2284,7 -2280,8 +2280,8 @@@ void sev_free_vcpu(struct kvm_vcpu *vcp
        svm = to_svm(vcpu);
  
        if (vcpu->arch.guest_state_protected)
-               sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE);
+               sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
        __free_page(virt_to_page(svm->sev_es.vmsa));
  
        if (svm->sev_es.ghcb_sa_free)
@@@ -2738,12 -2735,8 +2735,12 @@@ static int sev_handle_vmgexit_msr_proto
                pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
                        reason_set, reason_code);
  
 -              ret = -EINVAL;
 -              break;
 +              vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 +              vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
 +              vcpu->run->system_event.ndata = 1;
 +              vcpu->run->system_event.data[0] = control->ghcb_gpa;
 +
 +              return 0;
        }
        default:
                /* Error, keep GHCB MSR value as-is */
@@@ -2926,14 -2919,6 +2923,14 @@@ void sev_es_init_vmcb(struct vcpu_svm *
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 +
 +      if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) &&
 +          (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) ||
 +           guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) {
 +              set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1);
 +              if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP))
 +                      svm_clr_intercept(svm, INTERCEPT_RDTSCP);
 +      }
  }
  
  void sev_es_vcpu_reset(struct vcpu_svm *svm)
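
The sev_handle_vmgexit_msr_protocol() hunk above turns an SEV-ES guest's termination
request into a KVM_EXIT_SYSTEM_EVENT userspace exit of type KVM_SYSTEM_EVENT_SEV_TERM.
A minimal sketch of how a VMM might consume that exit is shown below; it assumes a
5.19-era <linux/kvm.h> and a hypothetical "run" pointer to the vCPU's mmap'ed
struct kvm_run, and is illustrative only, not part of the patch.

	#include <stdio.h>
	#include <linux/kvm.h>

	/* Illustrative only: report the new SEV termination event and tell the
	 * caller the exit was consumed; returns 1 when handled, 0 otherwise. */
	static int handle_sev_term(struct kvm_run *run)
	{
		if (run->exit_reason != KVM_EXIT_SYSTEM_EVENT ||
		    run->system_event.type != KVM_SYSTEM_EVENT_SEV_TERM)
			return 0;

		/* ndata is 1; data[0] carries the GHCB GPA supplied by the guest. */
		fprintf(stderr, "SEV-ES guest requested termination: GHCB GPA %#llx\n",
			(unsigned long long)run->system_event.data[0]);
		return 1;
	}
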
diff --combined arch/x86/kvm/svm/svm.c
index 3b49337998ec9c7acab9be31ffa5a25ca385af8e,7e45d03cd018a5cc354936fcebc5b14d43c2cbcc..63880b33ce370801eda69e9758b69cf838c06df2
@@@ -62,6 -62,8 +62,6 @@@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id)
  #define SEG_TYPE_LDT 2
  #define SEG_TYPE_BUSY_TSS16 3
  
 -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 -
  static bool erratum_383_found __read_mostly;
  
  u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
@@@ -99,7 -101,6 +99,7 @@@ static const struct svm_direct_access_m
        { .index = MSR_EFER,                            .always = false },
        { .index = MSR_IA32_CR_PAT,                     .always = false },
        { .index = MSR_AMD64_SEV_ES_GHCB,               .always = true  },
 +      { .index = MSR_TSC_AUX,                         .always = false },
        { .index = MSR_INVALID,                         .always = false },
  };
  
@@@ -171,7 -172,7 +171,7 @@@ static int vls = true
  module_param(vls, int, 0444);
  
  /* enable/disable Virtual GIF */
 -static int vgif = true;
 +int vgif = true;
  module_param(vgif, int, 0444);
  
  /* enable/disable LBR virtualization */
@@@ -188,9 -189,6 +188,9 @@@ module_param(tsc_scaling, int, 0444)
  static bool avic;
  module_param(avic, bool, 0444);
  
 +static bool force_avic;
 +module_param_unsafe(force_avic, bool, 0444);
 +
  bool __read_mostly dump_invalid_vmcb;
  module_param(dump_invalid_vmcb, bool, 0644);
  
@@@ -792,17 -790,6 +792,17 @@@ static void init_msrpm_offsets(void
        }
  }
  
 +void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
 +{
 +      to_vmcb->save.dbgctl            = from_vmcb->save.dbgctl;
 +      to_vmcb->save.br_from           = from_vmcb->save.br_from;
 +      to_vmcb->save.br_to             = from_vmcb->save.br_to;
 +      to_vmcb->save.last_excp_from    = from_vmcb->save.last_excp_from;
 +      to_vmcb->save.last_excp_to      = from_vmcb->save.last_excp_to;
 +
 +      vmcb_mark_dirty(to_vmcb, VMCB_LBR);
 +}
 +
  static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 +
 +      /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
 +      if (is_guest_mode(vcpu))
 +              svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
  }
  
  static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 +
 +      /*
 +       * Move the LBR msrs back to the vmcb01 to avoid copying them
 +       * on nested guest entries.
 +       */
 +      if (is_guest_mode(vcpu))
 +              svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
 +}
 +
 +static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
 +{
 +      /*
 +       * If the LBR virtualization is disabled, the LBR msrs are always
 +       * kept in the vmcb01 to avoid copying them on nested guest entries.
 +       *
 +       * If nested, and the LBR virtualization is enabled/disabled, the msrs
 +       * are moved between the vmcb01 and vmcb02 as needed.
 +       */
 +      struct vmcb *vmcb =
 +              (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
 +                      svm->vmcb : svm->vmcb01.ptr;
 +
 +      switch (index) {
 +      case MSR_IA32_DEBUGCTLMSR:
 +              return vmcb->save.dbgctl;
 +      case MSR_IA32_LASTBRANCHFROMIP:
 +              return vmcb->save.br_from;
 +      case MSR_IA32_LASTBRANCHTOIP:
 +              return vmcb->save.br_to;
 +      case MSR_IA32_LASTINTFROMIP:
 +              return vmcb->save.last_excp_from;
 +      case MSR_IA32_LASTINTTOIP:
 +              return vmcb->save.last_excp_to;
 +      default:
 +              KVM_BUG(false, svm->vcpu.kvm,
 +                      "%s: Unknown MSR 0x%x", __func__, index);
 +              return 0;
 +      }
 +}
 +
 +void svm_update_lbrv(struct kvm_vcpu *vcpu)
 +{
 +      struct vcpu_svm *svm = to_svm(vcpu);
 +
 +      bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
 +                                         DEBUGCTLMSR_LBR;
 +
 +      bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
 +                                    LBR_CTL_ENABLE_MASK);
 +
 +      if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
 +              if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
 +                      enable_lbrv = true;
 +
 +      if (enable_lbrv == current_enable_lbrv)
 +              return;
 +
 +      if (enable_lbrv)
 +              svm_enable_lbrv(vcpu);
 +      else
 +              svm_disable_lbrv(vcpu);
  }
  
  void disable_nmi_singlestep(struct vcpu_svm *svm)
@@@ -909,9 -831,6 +909,9 @@@ static void grow_ple_window(struct kvm_
        struct vmcb_control_area *control = &svm->vmcb->control;
        int old = control->pause_filter_count;
  
 +      if (kvm_pause_in_guest(vcpu->kvm) || !old)
 +              return;
 +
        control->pause_filter_count = __grow_ple_window(old,
                                                        pause_filter_count,
                                                        pause_filter_count_grow,
@@@ -930,9 -849,6 +930,9 @@@ static void shrink_ple_window(struct kv
        struct vmcb_control_area *control = &svm->vmcb->control;
        int old = control->pause_filter_count;
  
 +      if (kvm_pause_in_guest(vcpu->kvm) || !old)
 +              return;
 +
        control->pause_filter_count =
                                __shrink_ple_window(old,
                                                    pause_filter_count,
@@@ -1044,8 -960,6 +1044,8 @@@ static inline void init_vmcb_after_set_
  
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
 +
 +              svm->v_vmload_vmsave_enabled = false;
        } else {
                /*
                 * If hardware supports Virtual VMLOAD VMSAVE then enable it
  static void init_vmcb(struct kvm_vcpu *vcpu)
  {
        struct vcpu_svm *svm = to_svm(vcpu);
 -      struct vmcb_control_area *control = &svm->vmcb->control;
 -      struct vmcb_save_area *save = &svm->vmcb->save;
 +      struct vmcb *vmcb = svm->vmcb01.ptr;
 +      struct vmcb_control_area *control = &vmcb->control;
 +      struct vmcb_save_area *save = &vmcb->save;
  
        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
  
        if (kvm_vcpu_apicv_active(vcpu))
 -              avic_init_vmcb(svm);
 +              avic_init_vmcb(svm, vmcb);
  
        if (vgif) {
                svm_clr_intercept(svm, INTERCEPT_STGI);
                }
        }
  
 -      svm_hv_init_vmcb(svm->vmcb);
 +      svm_hv_init_vmcb(vmcb);
        init_vmcb_after_set_cpuid(vcpu);
  
 -      vmcb_mark_all_dirty(svm->vmcb);
 +      vmcb_mark_all_dirty(vmcb);
  
        enable_gif(svm);
  }
@@@ -1467,7 -1380,7 +1467,7 @@@ static void svm_set_vintr(struct vcpu_s
        /*
         * The following fields are ignored when AVIC is enabled
         */
 -      WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
 +      WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
  
        svm_set_intercept(svm, INTERCEPT_VINTR);
  
@@@ -2229,7 -2142,7 +2229,7 @@@ void svm_set_gif(struct vcpu_svm *svm, 
                 * Likewise, clear the VINTR intercept, we will set it
                 * again while processing KVM_REQ_EVENT if needed.
                 */
 -              if (vgif_enabled(svm))
 +              if (vgif)
                        svm_clr_intercept(svm, INTERCEPT_STGI);
                if (svm_is_intercept(svm, INTERCEPT_VINTR))
                        svm_clear_vintr(svm);
                 * in use, we still rely on the VINTR intercept (rather than
                 * STGI) to detect an open interrupt window.
                */
 -              if (!vgif_enabled(svm))
 +              if (!vgif)
                        svm_clear_vintr(svm);
        }
  }
@@@ -2662,12 -2575,25 +2662,12 @@@ static int svm_get_msr(struct kvm_vcpu 
        case MSR_TSC_AUX:
                msr_info->data = svm->tsc_aux;
                break;
 -      /*
 -       * Nobody will change the following 5 values in the VMCB so we can
 -       * safely return them on rdmsr. They will always be 0 until LBRV is
 -       * implemented.
 -       */
        case MSR_IA32_DEBUGCTLMSR:
 -              msr_info->data = svm->vmcb->save.dbgctl;
 -              break;
        case MSR_IA32_LASTBRANCHFROMIP:
 -              msr_info->data = svm->vmcb->save.br_from;
 -              break;
        case MSR_IA32_LASTBRANCHTOIP:
 -              msr_info->data = svm->vmcb->save.br_to;
 -              break;
        case MSR_IA32_LASTINTFROMIP:
 -              msr_info->data = svm->vmcb->save.last_excp_from;
 -              break;
        case MSR_IA32_LASTINTTOIP:
 -              msr_info->data = svm->vmcb->save.last_excp_to;
 +              msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
                break;
        case MSR_VM_HSAVE_PA:
                msr_info->data = svm->nested.hsave_msr;
@@@ -2913,13 -2839,12 +2913,13 @@@ static int svm_set_msr(struct kvm_vcpu 
                if (data & DEBUGCTL_RESERVED_BITS)
                        return 1;
  
 -              svm->vmcb->save.dbgctl = data;
 -              vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
 -              if (data & (1ULL<<0))
 -                      svm_enable_lbrv(vcpu);
 +              if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
 +                      svm->vmcb->save.dbgctl = data;
                else
 -                      svm_disable_lbrv(vcpu);
 +                      svm->vmcb01.ptr->save.dbgctl = data;
 +
 +              svm_update_lbrv(vcpu);
 +
                break;
        case MSR_VM_HSAVE_PA:
                /*
@@@ -2976,16 -2901,9 +2976,16 @@@ static int interrupt_window_interceptio
        svm_clear_vintr(to_svm(vcpu));
  
        /*
 -       * For AVIC, the only reason to end up here is ExtINTs.
 +       * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
         * In this case AVIC was temporarily disabled for
         * requesting the IRQ window and we have to re-enable it.
 +       *
 +       * If running nested, still remove the VM wide AVIC inhibit to
 +       * support the case in which the interrupt window was requested
 +       * when the vCPU was not running nested.
 +       *
 +       * All vCPUs which are still running nested will continue to have
 +       * their AVIC inhibited due to the per-vCPU AVIC inhibition.
         */
        kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
  
  static int pause_interception(struct kvm_vcpu *vcpu)
  {
        bool in_kernel;
 -
        /*
         * CPL is not made available for an SEV-ES guest, therefore
         * vcpu->arch.preempted_in_kernel can never be true.  Just
         */
        in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
  
 -      if (!kvm_pause_in_guest(vcpu->kvm))
 -              grow_ple_window(vcpu);
 +      grow_ple_window(vcpu);
  
        kvm_vcpu_on_spin(vcpu, in_kernel);
        return kvm_skip_emulated_instruction(vcpu);
@@@ -3576,20 -3496,14 +3576,20 @@@ static void svm_enable_irq_window(struc
         * enabled, the STGI interception will not occur. Enable the irq
         * window under the assumption that the hardware will set the GIF.
         */
 -      if (vgif_enabled(svm) || gif_set(svm)) {
 +      if (vgif || gif_set(svm)) {
                /*
                 * IRQ window is not needed when AVIC is enabled,
                 * unless we have pending ExtINT since it cannot be injected
 -               * via AVIC. In such case, we need to temporarily disable AVIC,
 +               * via AVIC. In such case, KVM needs to temporarily disable AVIC,
                 * and fallback to injecting IRQ via V_IRQ.
 +               *
 +               * If running nested, AVIC is already locally inhibited
 +               * on this vCPU, therefore there is no need to request
 +               * the VM wide AVIC inhibition.
                 */
 -              kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
 +              if (!is_guest_mode(vcpu))
 +                      kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
 +
                svm_set_vintr(svm);
        }
  }
@@@ -3602,7 -3516,7 +3602,7 @@@ static void svm_enable_nmi_window(struc
                return; /* IRET will cause a vm exit */
  
        if (!gif_set(svm)) {
 -              if (vgif_enabled(svm))
 +              if (vgif)
                        svm_set_intercept(svm, INTERCEPT_STGI);
                return; /* STGI will cause a vm exit */
        }
@@@ -3951,7 -3865,7 +3951,7 @@@ static void svm_load_mmu_pgd(struct kvm
                hv_track_root_tdp(vcpu, root_hpa);
  
                cr3 = vcpu->arch.cr3;
 -      } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
 +      } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) {
                cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
        } else {
                /* PCID in the guest should be impossible with a 32-bit MMU. */
@@@ -4032,17 -3946,6 +4032,17 @@@ static void svm_vcpu_after_set_cpuid(st
                             guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
  
        svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
 +      svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
 +
 +      svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
 +
 +      svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
 +                      guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
 +
 +      svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
 +                      guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
 +
 +      svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
  
        svm_recalc_instruction_intercepts(vcpu, svm);
  
                 */
                if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
                        kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
 -
 -              /*
 -               * Currently, AVIC does not work with nested virtualization.
 -               * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
 -               */
 -              if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
 -                      kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED);
        }
        init_vmcb_after_set_cpuid(vcpu);
  }
@@@ -4314,7 -4224,7 +4314,7 @@@ static int svm_enter_smm(struct kvm_vcp
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
  
 -      ret = nested_svm_vmexit(svm);
 +      ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
        if (ret)
                return ret;
  
@@@ -4411,7 -4321,7 +4411,7 @@@ static void svm_enable_smi_window(struc
        struct vcpu_svm *svm = to_svm(vcpu);
  
        if (!gif_set(svm)) {
 -              if (vgif_enabled(svm))
 +              if (vgif)
                        svm_set_intercept(svm, INTERCEPT_STGI);
                /* STGI will cause a vm exit */
        } else {
@@@ -4695,6 -4605,7 +4695,6 @@@ static struct kvm_x86_ops svm_x86_ops _
  
        .sched_in = svm_sched_in,
  
 -      .pmu_ops = &amd_pmu_ops,
        .nested_ops = &svm_nested_ops,
  
        .deliver_interrupt = svm_deliver_interrupt,
        .mem_enc_ioctl = sev_mem_enc_ioctl,
        .mem_enc_register_region = sev_mem_enc_register_region,
        .mem_enc_unregister_region = sev_mem_enc_unregister_region,
+       .guest_memory_reclaimed = sev_guest_memory_reclaimed,
  
        .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
        .vm_move_enc_context_from = sev_vm_move_enc_context_from,
        .complete_emulated_msr = svm_complete_emulated_msr,
  
        .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
 +      .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
  };
  
  /*
@@@ -4785,20 -4696,6 +4786,20 @@@ static __init void svm_set_cpu_caps(voi
                if (tsc_scaling)
                        kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
  
 +              if (vls)
 +                      kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
 +              if (lbrv)
 +                      kvm_cpu_cap_set(X86_FEATURE_LBRV);
 +
 +              if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
 +                      kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
 +
 +              if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
 +                      kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
 +
 +              if (vgif)
 +                      kvm_cpu_cap_set(X86_FEATURE_VGIF);
 +
                /* Nested VM can receive #VMEXIT instead of triggering #GP */
                kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
        }
@@@ -4892,9 -4789,6 +4893,9 @@@ static __init int svm_hardware_setup(vo
                          get_npt_level(), PG_LEVEL_1G);
        pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
  
 +      /* Setup shadow_me_value and shadow_me_mask */
 +      kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
 +
        /* Note, SEV setup consumes npt_enabled. */
        sev_hardware_setup();
  
                        nrips = false;
        }
  
 -      enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
 +      enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
  
        if (enable_apicv) {
 -              pr_info("AVIC enabled\n");
 +              if (!boot_cpu_has(X86_FEATURE_AVIC)) {
 +                      pr_warn("AVIC is not supported in CPUID but force enabled\n");
 +                      pr_warn("Your system might crash and burn\n");
 +              } else
 +                      pr_info("AVIC enabled\n");
  
                amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
        } else {
                svm_x86_ops.vcpu_blocking = NULL;
                svm_x86_ops.vcpu_unblocking = NULL;
 +              svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
        }
  
        if (vls) {
@@@ -4991,7 -4880,6 +4992,7 @@@ static struct kvm_x86_init_ops svm_init
        .check_processor_compatibility = svm_check_processor_compat,
  
        .runtime_ops = &svm_x86_ops,
 +      .pmu_ops = &amd_pmu_ops,
  };
  
  static int __init svm_init(void)
diff --combined arch/x86/kvm/svm/svm.h
index 32220a1b0ea207cb0b7dd29bf4ef5ce2491b3884,f76deff71002cbbd3403f43faf7a773bfb14280d..45a87b2a8b3c47637a27eb0049ba411c22cfa5a9
  #define       IOPM_SIZE PAGE_SIZE * 3
  #define       MSRPM_SIZE PAGE_SIZE * 2
  
 -#define MAX_DIRECT_ACCESS_MSRS        20
 +#define MAX_DIRECT_ACCESS_MSRS        21
  #define MSRPM_OFFSETS 16
  extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  extern bool npt_enabled;
 +extern int vgif;
  extern bool intercept_smi;
  
  /*
@@@ -232,14 -231,9 +232,14 @@@ struct vcpu_svm 
        unsigned int3_injected;
        unsigned long int3_rip;
  
 -      /* cached guest cpuid flags for faster access */
 +      /* optional nested SVM features that are enabled for this guest  */
        bool nrips_enabled                : 1;
        bool tsc_scaling_enabled          : 1;
 +      bool v_vmload_vmsave_enabled      : 1;
 +      bool lbrv_enabled                 : 1;
 +      bool pause_filter_enabled         : 1;
 +      bool pause_threshold_enabled      : 1;
 +      bool vgif_enabled                 : 1;
  
        u32 ldr_reg;
        u32 dfr_reg;
@@@ -458,70 -452,44 +458,70 @@@ static inline bool svm_is_intercept(str
        return vmcb_is_intercept(&svm->vmcb->control, bit);
  }
  
 -static inline bool vgif_enabled(struct vcpu_svm *svm)
 +static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
  {
 -      return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
 +      return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
 +}
 +
 +static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
 +{
 +      if (!vgif)
 +              return NULL;
 +
 +      if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm))
 +              return svm->nested.vmcb02.ptr;
 +      else
 +              return svm->vmcb01.ptr;
  }
  
  static inline void enable_gif(struct vcpu_svm *svm)
  {
 -      if (vgif_enabled(svm))
 -              svm->vmcb->control.int_ctl |= V_GIF_MASK;
 +      struct vmcb *vmcb = get_vgif_vmcb(svm);
 +
 +      if (vmcb)
 +              vmcb->control.int_ctl |= V_GIF_MASK;
        else
                svm->vcpu.arch.hflags |= HF_GIF_MASK;
  }
  
  static inline void disable_gif(struct vcpu_svm *svm)
  {
 -      if (vgif_enabled(svm))
 -              svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
 +      struct vmcb *vmcb = get_vgif_vmcb(svm);
 +
 +      if (vmcb)
 +              vmcb->control.int_ctl &= ~V_GIF_MASK;
        else
                svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
  }
  
  static inline bool gif_set(struct vcpu_svm *svm)
  {
 -      if (vgif_enabled(svm))
 -              return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
 +      struct vmcb *vmcb = get_vgif_vmcb(svm);
 +
 +      if (vmcb)
 +              return !!(vmcb->control.int_ctl & V_GIF_MASK);
        else
                return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
  }
  
 +static inline bool nested_npt_enabled(struct vcpu_svm *svm)
 +{
 +      return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
 +}
 +
  /* svm.c */
  #define MSR_INVALID                           0xffffffffU
  
 +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 +
  extern bool dump_invalid_vmcb;
  
  u32 svm_msrpm_offset(u32 msr);
  u32 *svm_vcpu_alloc_msrpm(void);
  void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
  void svm_vcpu_free_msrpm(u32 *msrpm);
 +void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
 +void svm_update_lbrv(struct kvm_vcpu *vcpu);
  
  int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
  void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
@@@ -606,7 -574,7 +606,7 @@@ extern struct kvm_x86_nested_ops svm_ne
  int avic_ga_log_notifier(u32 ga_tag);
  void avic_vm_destroy(struct kvm *kvm);
  int avic_vm_init(struct kvm *kvm);
 -void avic_init_vmcb(struct vcpu_svm *svm);
 +void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
  int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
  int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
  int avic_init_vcpu(struct vcpu_svm *svm);
@@@ -624,7 -592,6 +624,7 @@@ int avic_pi_update_irte(struct kvm *kvm
  void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
  void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
  void avic_ring_doorbell(struct kvm_vcpu *vcpu);
 +unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
  
  /* sev.c */
  
@@@ -642,6 -609,8 +642,8 @@@ int sev_mem_enc_unregister_region(struc
                                  struct kvm_enc_region *range);
  int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
  int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+ void sev_guest_memory_reclaimed(struct kvm *kvm);
  void pre_sev_run(struct vcpu_svm *svm, int cpu);
  void __init sev_set_cpu_caps(void);
  void __init sev_hardware_setup(void);
index a6688663da4d37798a01ed5e7db16ff2f0440b60,856c87563883302e8ee4da0eec769fd8e0f99908..f5cb18e00e789259b591c677ad5ddac18a3ce621
@@@ -476,23 -476,24 +476,23 @@@ static int nested_vmx_check_exception(s
        return 0;
  }
  
 -
 -static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 -              struct x86_exception *fault)
 +static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
 +                                                  struct x86_exception *fault)
  {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
  
        WARN_ON(!is_guest_mode(vcpu));
  
        if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
 -              !to_vmx(vcpu)->nested.nested_run_pending) {
 +          !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
                vmcs12->vm_exit_intr_error_code = fault->error_code;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
                                  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
                                  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
                                  fault->address);
 -      } else {
 -              kvm_inject_page_fault(vcpu, fault);
 +              return true;
        }
 +      return false;
  }
  
  static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
@@@ -2613,6 -2614,9 +2613,6 @@@ static int prepare_vmcs02(struct kvm_vc
                vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
        }
  
 -      if (!enable_ept)
 -              vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
 -
        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
            WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
                                     vmcs12->guest_ia32_perf_global_ctrl))) {
@@@ -3691,34 -3695,12 +3691,34 @@@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu
  }
  
  static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
 -                                    struct vmcs12 *vmcs12)
 +                                    struct vmcs12 *vmcs12,
 +                                    u32 vm_exit_reason, u32 exit_intr_info)
  {
        u32 idt_vectoring;
        unsigned int nr;
  
 -      if (vcpu->arch.exception.injected) {
 +      /*
 +       * Per the SDM, VM-Exits due to double and triple faults are never
 +       * considered to occur during event delivery, even if the double/triple
 +       * fault is the result of an escalating vectoring issue.
 +       *
 +       * Note, the SDM qualifies the double fault behavior with "The original
 +       * event results in a double-fault exception".  It's unclear why the
 +       * qualification exists since exits due to double fault can occur only
 +       * while vectoring a different exception (injected events are never
 +       * subject to interception), i.e. there's _always_ an original event.
 +       *
 +       * The SDM also uses NMI as a confusing example for the "original event
 +       * causes the VM exit directly" clause.  NMI isn't special in any way,
 +       * the same rule applies to all events that cause an exit directly.
 +       * NMI is an odd choice for the example because NMIs can only occur on
 +       * instruction boundaries, i.e. they _can't_ occur during vectoring.
 +       */
 +      if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
 +          ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
 +           is_double_fault(exit_intr_info))) {
 +              vmcs12->idt_vectoring_info_field = 0;
 +      } else if (vcpu->arch.exception.injected) {
                nr = vcpu->arch.exception.nr;
                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
  
                        idt_vectoring |= INTR_TYPE_EXT_INTR;
  
                vmcs12->idt_vectoring_info_field = idt_vectoring;
 +      } else {
 +              vmcs12->idt_vectoring_info_field = 0;
        }
  }
  
@@@ -4222,12 -4202,12 +4222,12 @@@ static void prepare_vmcs12(struct kvm_v
        if (to_vmx(vcpu)->exit_reason.enclave_mode)
                vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
        vmcs12->exit_qualification = exit_qualification;
 -      vmcs12->vm_exit_intr_info = exit_intr_info;
 -
 -      vmcs12->idt_vectoring_info_field = 0;
 -      vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 -      vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
  
 +      /*
 +       * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
 +       * and only EXIT_REASON and EXIT_QUALIFICATION are updated; all other
 +       * exit info fields are unmodified.
 +       */
        if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
                vmcs12->launch_state = 1;
  
                 * Transfer the event that L0 or L1 may have wanted to inject into
                 * L2 to IDT_VECTORING_INFO_FIELD.
                 */
 -              vmcs12_save_pending_event(vcpu, vmcs12);
 +              vmcs12_save_pending_event(vcpu, vmcs12,
 +                                        vm_exit_reason, exit_intr_info);
 +
 +              vmcs12->vm_exit_intr_info = exit_intr_info;
 +              vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 +              vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
  
                /*
                 * According to spec, there's no need to store the guest's
@@@ -4543,6 -4518,9 +4543,6 @@@ void nested_vmx_vmexit(struct kvm_vcpu 
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
  
 -      /* Similarly, triple faults in L2 should never escape. */
 -      WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
 -
        if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
                /*
                 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
                kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
        }
  
+       if (vmx->nested.update_vmcs01_apicv_status) {
+               vmx->nested.update_vmcs01_apicv_status = false;
+               kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+       }
        if ((vm_exit_reason != -1) &&
            (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
                vmx->nested.need_vmcs12_to_shadow_sync = true;
@@@ -6826,7 -6809,6 +6831,7 @@@ __init int nested_vmx_hardware_setup(in
  struct kvm_x86_nested_ops vmx_nested_ops = {
        .leave_nested = vmx_leave_nested,
        .check_events = vmx_check_nested_events,
 +      .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
        .triple_fault = nested_vmx_triple_fault,
        .get_state = vmx_get_nested_state,
index 9db662399487f78c73c943ed898143713de96121,b82b6709d7a819090bb28106c0b70362fea5d7a1..37e9eb32e3d90211fba088c6ba8397f8385c42e1
@@@ -431,15 -431,11 +431,11 @@@ static int intel_pmu_set_msr(struct kvm
                            !(msr & MSR_PMC_FULL_WIDTH_BIT))
                                data = (s64)(s32)data;
                        pmc->counter += data - pmc_read_counter(pmc);
-                       if (pmc->perf_event && !pmc->is_paused)
-                               perf_event_period(pmc->perf_event,
-                                                 get_sample_period(pmc, data));
+                       pmc_update_sample_period(pmc);
                        return 0;
                } else if ((pmc = get_fixed_pmc(pmu, msr))) {
                        pmc->counter += data - pmc_read_counter(pmc);
-                       if (pmc->perf_event && !pmc->is_paused)
-                               perf_event_period(pmc->perf_event,
-                                                 get_sample_period(pmc, data));
+                       pmc_update_sample_period(pmc);
                        return 0;
                } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
                        if (data == pmc->eventsel)
@@@ -723,7 -719,7 +719,7 @@@ static void intel_pmu_cleanup(struct kv
                intel_pmu_release_guest_lbr_event(vcpu);
  }
  
 -struct kvm_pmu_ops intel_pmu_ops = {
 +struct kvm_pmu_ops intel_pmu_ops __initdata = {
        .pmc_perf_hw_id = intel_pmc_perf_hw_id,
        .pmc_is_enabled = intel_pmc_is_enabled,
        .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
diff --combined arch/x86/kvm/vmx/vmx.c
index 5fb37e75fd3159868062a8f7de4648244f4cb60f,d58b763df855f6dfaaa761e0d1a4a7c8ccc12d1f..cbbcf97d9e666642ff587512ccc853134bc46c9a
@@@ -2444,7 -2444,7 +2444,7 @@@ static __init int setup_vmcs_config(str
                                &_cpu_based_exec_control) < 0)
                return -EIO;
  #ifdef CONFIG_X86_64
 -      if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
 +      if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)
                _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
                                           ~CPU_BASED_CR8_STORE_EXITING;
  #endif
@@@ -2948,7 -2948,7 +2948,7 @@@ static void vmx_flush_tlb_current(struc
  
        if (enable_ept)
                ept_sync_context(construct_eptp(vcpu, root_hpa,
 -                                              mmu->shadow_root_level));
 +                                              mmu->root_role.level));
        else
                vpid_sync_context(vmx_get_current_vpid(vcpu));
  }
@@@ -4174,6 -4174,11 +4174,11 @@@ static void vmx_refresh_apicv_exec_ctrl
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
+       if (is_guest_mode(vcpu)) {
+               vmx->nested.update_vmcs01_apicv_status = true;
+               return;
+       }
        pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
        if (cpu_has_secondary_exec_ctrls()) {
                if (kvm_vcpu_apicv_active(vcpu))
@@@ -4380,7 -4385,7 +4385,7 @@@ static void init_vmcs(struct vcpu_vmx *
        if (cpu_has_secondary_exec_ctrls())
                secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
  
 -      if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
 +      if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
                vmcs_write64(EOI_EXIT_BITMAP1, 0);
                vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@@ -5405,7 -5410,9 +5410,7 @@@ static int handle_ept_violation(struct 
        error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
                      ? PFERR_FETCH_MASK : 0;
        /* ept page table entry is present? */
 -      error_code |= (exit_qualification &
 -                     (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
 -                      EPT_VIOLATION_EXECUTABLE))
 +      error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
                      ? PFERR_PRESENT_MASK : 0;
  
        error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
@@@ -7816,6 -7823,7 +7821,6 @@@ static struct kvm_x86_ops vmx_x86_ops _
        .cpu_dirty_log_size = PML_ENTITY_NUM,
        .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
  
 -      .pmu_ops = &intel_pmu_ops,
        .nested_ops = &vmx_nested_ops,
  
        .pi_update_irte = vmx_pi_update_irte,
@@@ -7883,31 -7891,6 +7888,31 @@@ static __init void vmx_setup_user_retur
                kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
  }
  
 +static void __init vmx_setup_me_spte_mask(void)
 +{
 +      u64 me_mask = 0;
 +
 +      /*
 +       * kvm_get_shadow_phys_bits() returns shadow_phys_bits.  Use
 +       * the former to avoid exposing shadow_phys_bits.
 +       *
 +       * On a pre-MKTME system, boot_cpu_data.x86_phys_bits equals
 +       * shadow_phys_bits.  On MKTME and/or TDX capable systems,
 +       * boot_cpu_data.x86_phys_bits holds the actual physical address
 +       * width w/o the KeyID bits, and shadow_phys_bits equals MAXPHYADDR
 +       * as reported by CPUID.  The bits in between are the KeyID bits.
 +       */
 +      if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
 +              me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
 +                      kvm_get_shadow_phys_bits() - 1);
 +      /*
 +       * Unlike SME, the host kernel doesn't support setting up any
 +       * MKTME KeyID on Intel platforms.  No memory encryption
 +       * bits should be included in the SPTE.
 +       */
 +      kvm_mmu_set_me_spte_mask(0, me_mask);
 +}
 +
  static struct kvm_x86_init_ops vmx_init_ops __initdata;
  
  static __init int hardware_setup(void)
                kvm_mmu_set_ept_masks(enable_ept_ad_bits,
                                      cpu_has_vmx_ept_execute_only());
  
 +      /*
 +       * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
 +       * bits to shadow_zero_check.
 +       */
 +      vmx_setup_me_spte_mask();
 +
        kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
                          ept_caps_to_lpage_level(vmx_capability.ept));
  
@@@ -8100,7 -8077,6 +8105,7 @@@ static struct kvm_x86_init_ops vmx_init
        .handle_intel_pt_intr = NULL,
  
        .runtime_ops = &vmx_x86_ops,
 +      .pmu_ops = &intel_pmu_ops,
  };
  
  static void vmx_cleanup_l1d_flush(void)
diff --combined arch/x86/kvm/x86.c
index 8b62c9b7795efade6fc1b9499164c33f2a59f2c9,4790f0d7d40b84293ef7f6d7e5a689c645c5a352..bc507d6414f46d692aef6e3f1b7a841a3d0fdb61
@@@ -266,12 -266,7 +266,12 @@@ const struct kvm_stats_header kvm_vm_st
  
  const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        KVM_GENERIC_VCPU_STATS(),
 +      STATS_DESC_COUNTER(VCPU, pf_taken),
        STATS_DESC_COUNTER(VCPU, pf_fixed),
 +      STATS_DESC_COUNTER(VCPU, pf_emulate),
 +      STATS_DESC_COUNTER(VCPU, pf_spurious),
 +      STATS_DESC_COUNTER(VCPU, pf_fast),
 +      STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
        STATS_DESC_COUNTER(VCPU, pf_guest),
        STATS_DESC_COUNTER(VCPU, tlb_flush),
        STATS_DESC_COUNTER(VCPU, invlpg),
@@@ -753,7 -748,6 +753,7 @@@ void kvm_inject_page_fault(struct kvm_v
  }
  EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
  
 +/* Returns true if the page fault was immediately morphed into a VM-Exit. */
  bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
                                    struct x86_exception *fault)
  {
                kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
                                       fault_mmu->root.hpa);
  
 +      /*
 +       * A workaround for KVM's bad exception handling.  If KVM injected an
 +       * exception into L2, and L2 encountered a #PF while vectoring the
 +       * injected exception, manually check to see if L1 wants to intercept
 +       * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
 +       * In all other cases, defer the check to nested_ops->check_events(),
 +       * which will correctly handle priority (this does not).  Note, other
 +       * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
 +       * most problematic, e.g. when L0 and L1 are both intercepting #PF for
 +       * shadow paging.
 +       *
 +       * TODO: Rewrite exception handling to track injected and pending
 +       *       (VM-Exit) exceptions separately.
 +       */
 +      if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
 +          kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
 +              return true;
 +
        fault_mmu->inject_page_fault(vcpu, fault);
 -      return fault->nested_page_fault;
 +      return false;
  }
  EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
  
@@@ -985,13 -961,11 +985,13 @@@ void kvm_load_guest_xsave_state(struct 
                        wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
        }
  
 +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (static_cpu_has(X86_FEATURE_PKU) &&
 -          (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
 -           (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
 -          vcpu->arch.pkru != vcpu->arch.host_pkru)
 +          vcpu->arch.pkru != vcpu->arch.host_pkru &&
 +          ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
 +           kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
                write_pkru(vcpu->arch.pkru);
 +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
  }
  EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
  
@@@ -1000,15 -974,13 +1000,15 @@@ void kvm_load_host_xsave_state(struct k
        if (vcpu->arch.guest_state_protected)
                return;
  
 +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (static_cpu_has(X86_FEATURE_PKU) &&
 -          (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
 -           (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
 +          ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
 +           kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
                vcpu->arch.pkru = rdpkru();
                if (vcpu->arch.pkru != vcpu->arch.host_pkru)
                        write_pkru(vcpu->arch.host_pkru);
        }
 +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
  
        if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
  
@@@ -2277,13 -2249,14 +2277,13 @@@ static void kvm_write_system_time(struc
        kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
  
        /* we verify if the enable bit is set... */
 -      vcpu->arch.pv_time_enabled = false;
 -      if (!(system_time & 1))
 -              return;
 -
 -      if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
 -                                     &vcpu->arch.pv_time, system_time & ~1ULL,
 -                                     sizeof(struct pvclock_vcpu_time_info)))
 -              vcpu->arch.pv_time_enabled = true;
 +      if (system_time & 1) {
 +              kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
 +                                        KVM_HOST_USES_PFN, system_time & ~1ULL,
 +                                        sizeof(struct pvclock_vcpu_time_info));
 +      } else {
 +              kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
 +      }
  
        return;
  }
@@@ -2988,55 -2961,63 +2988,55 @@@ u64 get_kvmclock_ns(struct kvm *kvm
        return data.clock;
  }
  
 -static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
 -                                 struct gfn_to_hva_cache *cache,
 -                                 unsigned int offset)
 +static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
 +                                  struct gfn_to_pfn_cache *gpc,
 +                                  unsigned int offset)
  {
        struct kvm_vcpu_arch *vcpu = &v->arch;
 -      struct pvclock_vcpu_time_info guest_hv_clock;
 +      struct pvclock_vcpu_time_info *guest_hv_clock;
 +      unsigned long flags;
  
 -      if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
 -              &guest_hv_clock, offset, sizeof(guest_hv_clock))))
 -              return;
 +      read_lock_irqsave(&gpc->lock, flags);
 +      while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
 +                                         offset + sizeof(*guest_hv_clock))) {
 +              read_unlock_irqrestore(&gpc->lock, flags);
 +
 +              if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
 +                                               offset + sizeof(*guest_hv_clock)))
 +                      return;
  
 -      /* This VCPU is paused, but it's legal for a guest to read another
 +              read_lock_irqsave(&gpc->lock, flags);
 +      }
 +
 +      guest_hv_clock = (void *)(gpc->khva + offset);
 +
 +      /*
 +       * This VCPU is paused, but it's legal for a guest to read another
         * VCPU's kvmclock, so we really have to follow the specification where
         * it says that version is odd if data is being modified, and even after
         * it is consistent.
 -       *
 -       * Version field updates must be kept separate.  This is because
 -       * kvm_write_guest_cached might use a "rep movs" instruction, and
 -       * writes within a string instruction are weakly ordered.  So there
 -       * are three writes overall.
 -       *
 -       * As a small optimization, only write the version field in the first
 -       * and third write.  The vcpu->pv_time cache is still valid, because the
 -       * version field is the first in the struct.
         */
 -      BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
 -
 -      if (guest_hv_clock.version & 1)
 -              ++guest_hv_clock.version;  /* first time write, random junk */
 -
 -      vcpu->hv_clock.version = guest_hv_clock.version + 1;
 -      kvm_write_guest_offset_cached(v->kvm, cache,
 -                                    &vcpu->hv_clock, offset,
 -                                    sizeof(vcpu->hv_clock.version));
  
 +      guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
        smp_wmb();
  
        /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
 -      vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
 +      vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
  
        if (vcpu->pvclock_set_guest_stopped_request) {
                vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
                vcpu->pvclock_set_guest_stopped_request = false;
        }
  
 -      trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
 +      memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
 +      smp_wmb();
  
 -      kvm_write_guest_offset_cached(v->kvm, cache,
 -                                    &vcpu->hv_clock, offset,
 -                                    sizeof(vcpu->hv_clock));
 +      guest_hv_clock->version = ++vcpu->hv_clock.version;
  
 -      smp_wmb();
 +      mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
 +      read_unlock_irqrestore(&gpc->lock, flags);
  
 -      vcpu->hv_clock.version++;
 -      kvm_write_guest_offset_cached(v->kvm, cache,
 -                                   &vcpu->hv_clock, offset,
 -                                   sizeof(vcpu->hv_clock.version));
 +      trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
  }
  
  static int kvm_guest_time_update(struct kvm_vcpu *v)
  
        vcpu->hv_clock.flags = pvclock_flags;
  
 -      if (vcpu->pv_time_enabled)
 -              kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
 -      if (vcpu->xen.vcpu_info_set)
 -              kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
 -                                     offsetof(struct compat_vcpu_info, time));
 -      if (vcpu->xen.vcpu_time_info_set)
 -              kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
 +      if (vcpu->pv_time.active)
 +              kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
 +      if (vcpu->xen.vcpu_info_cache.active)
 +              kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
 +                                      offsetof(struct compat_vcpu_info, time));
 +      if (vcpu->xen.vcpu_time_info_cache.active)
 +              kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
        kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
        return 0;
  }
@@@ -3319,7 -3300,7 +3319,7 @@@ static int kvm_pv_enable_async_pf_int(s
  
  static void kvmclock_reset(struct kvm_vcpu *vcpu)
  {
 -      vcpu->arch.pv_time_enabled = false;
 +      kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
        vcpu->arch.time = 0;
  }
  
@@@ -4303,8 -4284,7 +4303,8 @@@ int kvm_vm_ioctl_check_extension(struc
                r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
                    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
                    KVM_XEN_HVM_CONFIG_SHARED_INFO |
 -                  KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL;
 +                  KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
 +                  KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
                if (sched_info_on())
                        r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
                break;
                r = boot_cpu_has(X86_FEATURE_XSAVE);
                break;
        case KVM_CAP_TSC_CONTROL:
 +      case KVM_CAP_VM_TSC_CONTROL:
                r = kvm_has_tsc_control;
                break;
        case KVM_CAP_X2APIC_API:
@@@ -5123,7 -5102,7 +5123,7 @@@ static int kvm_vcpu_ioctl_x86_set_xcrs(
   */
  static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
  {
 -      if (!vcpu->arch.pv_time_enabled)
 +      if (!vcpu->arch.pv_time.active)
                return -EINVAL;
        vcpu->arch.pvclock_set_guest_stopped_request = true;
        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@@ -6207,7 -6186,7 +6207,7 @@@ static int kvm_arch_suspend_notifier(st
  
        mutex_lock(&kvm->lock);
        kvm_for_each_vcpu(i, vcpu, kvm) {
 -              if (!vcpu->arch.pv_time_enabled)
 +              if (!vcpu->arch.pv_time.active)
                        continue;
  
                ret = kvm_set_guest_paused(vcpu);
@@@ -6534,15 -6513,6 +6534,15 @@@ set_pit2_out
                r = kvm_xen_hvm_set_attr(kvm, &xha);
                break;
        }
 +      case KVM_XEN_HVM_EVTCHN_SEND: {
 +              struct kvm_irq_routing_xen_evtchn uxe;
 +
 +              r = -EFAULT;
 +              if (copy_from_user(&uxe, argp, sizeof(uxe)))
 +                      goto out;
 +              r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
 +              break;
 +      }
  #endif
        case KVM_SET_CLOCK:
                r = kvm_vm_ioctl_set_clock(kvm, argp);
        case KVM_GET_CLOCK:
                r = kvm_vm_ioctl_get_clock(kvm, argp);
                break;
 +      case KVM_SET_TSC_KHZ: {
 +              u32 user_tsc_khz;
 +
 +              r = -EINVAL;
 +              user_tsc_khz = (u32)arg;
 +
 +              if (kvm_has_tsc_control &&
 +                  user_tsc_khz >= kvm_max_guest_tsc_khz)
 +                      goto out;
 +
 +              if (user_tsc_khz == 0)
 +                      user_tsc_khz = tsc_khz;
 +
 +              WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
 +              r = 0;
 +
 +              goto out;
 +      }
 +      case KVM_GET_TSC_KHZ: {
 +              r = READ_ONCE(kvm->arch.default_tsc_khz);
 +              goto out;
 +      }
        case KVM_MEMORY_ENCRYPT_OP: {
                r = -ENOTTY;
                if (!kvm_x86_ops.mem_enc_ioctl)
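
The KVM_SET_TSC_KHZ/KVM_GET_TSC_KHZ cases added to kvm_arch_vm_ioctl() above make the
guest TSC frequency configurable at VM scope (advertised via KVM_CAP_VM_TSC_CONTROL),
so vCPUs created afterwards start from that default.  A hedged userspace sketch,
assuming a hypothetical already-created VM file descriptor vm_fd, is shown below;
it is illustrative only, not part of the patch.

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Illustrative only: pin the VM-wide default guest TSC to 2.5 GHz, then
	 * read the effective value back (KVM_GET_TSC_KHZ returns kHz directly). */
	static void set_vm_default_tsc(int vm_fd)
	{
		if (ioctl(vm_fd, KVM_SET_TSC_KHZ, 2500000) < 0)
			perror("KVM_SET_TSC_KHZ");

		printf("VM default TSC: %d kHz\n", ioctl(vm_fd, KVM_GET_TSC_KHZ, 0));
	}
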
@@@ -7281,8 -7229,15 +7281,8 @@@ static int emulator_write_emulated(stru
                                   exception, &write_emultor);
  }
  
 -#define CMPXCHG_TYPE(t, ptr, old, new) \
 -      (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
 -
 -#ifdef CONFIG_X86_64
 -#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
 -#else
 -#  define CMPXCHG64(ptr, old, new) \
 -      (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
 -#endif
 +#define emulator_try_cmpxchg_user(t, ptr, old, new) \
 +      (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
  
  static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
                                     unsigned long addr,
                                     unsigned int bytes,
                                     struct x86_exception *exception)
  {
 -      struct kvm_host_map map;
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        u64 page_line_mask;
 +      unsigned long hva;
        gpa_t gpa;
 -      char *kaddr;
 -      bool exchanged;
 +      int r;
  
        /* guests cmpxchg8b have to be emulated atomically */
        if (bytes > 8 || (bytes & (bytes - 1)))
        if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
                goto emul_write;
  
 -      if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
 +      hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
 +      if (kvm_is_error_hva(hva))
                goto emul_write;
  
 -      kaddr = map.hva + offset_in_page(gpa);
 +      hva += offset_in_page(gpa);
  
        switch (bytes) {
        case 1:
 -              exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
 +              r = emulator_try_cmpxchg_user(u8, hva, old, new);
                break;
        case 2:
 -              exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
 +              r = emulator_try_cmpxchg_user(u16, hva, old, new);
                break;
        case 4:
 -              exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
 +              r = emulator_try_cmpxchg_user(u32, hva, old, new);
                break;
        case 8:
 -              exchanged = CMPXCHG64(kaddr, old, new);
 +              r = emulator_try_cmpxchg_user(u64, hva, old, new);
                break;
        default:
                BUG();
        }
  
 -      kvm_vcpu_unmap(vcpu, &map, true);
 -
 -      if (!exchanged)
 +      if (r < 0)
 +              return X86EMUL_UNHANDLEABLE;
 +      if (r)
                return X86EMUL_CMPXCHG_FAILED;
  
        kvm_page_track_write(vcpu, gpa, new, bytes);
@@@ -8106,7 -8061,7 +8106,7 @@@ static bool reexecute_instruction(struc
            WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
                return false;
  
 -      if (!vcpu->arch.mmu->direct_map) {
 +      if (!vcpu->arch.mmu->root_role.direct) {
                /*
                 * Write permission should be allowed since only
                 * write access need to be emulated.
        kvm_release_pfn_clean(pfn);
  
        /* The instructions are well-emulated on direct mmu. */
 -      if (vcpu->arch.mmu->direct_map) {
 +      if (vcpu->arch.mmu->root_role.direct) {
                unsigned int indirect_shadow_pages;
  
                write_lock(&vcpu->kvm->mmu_lock);
@@@ -8207,7 -8162,7 +8207,7 @@@ static bool retry_instruction(struct x8
        vcpu->arch.last_retry_eip = ctxt->eip;
        vcpu->arch.last_retry_addr = cr2_or_gpa;
  
 -      if (!vcpu->arch.mmu->direct_map)
 +      if (!vcpu->arch.mmu->root_role.direct)
                gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
  
        kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@@ -8487,7 -8442,7 +8487,7 @@@ restart
                ctxt->exception.address = cr2_or_gpa;
  
                /* With shadow page tables, cr2 contains a GVA or nGPA. */
 -              if (vcpu->arch.mmu->direct_map) {
 +              if (vcpu->arch.mmu->root_role.direct) {
                        ctxt->gpa_available = true;
                        ctxt->gpa_val = cr2_or_gpa;
                }
@@@ -8834,22 -8789,22 +8834,22 @@@ static int kvmclock_cpu_online(unsigne
  
  static void kvm_timer_init(void)
  {
 -      max_tsc_khz = tsc_khz;
 -
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
 -#ifdef CONFIG_CPU_FREQ
 -              struct cpufreq_policy *policy;
 -              int cpu;
 -
 -              cpu = get_cpu();
 -              policy = cpufreq_cpu_get(cpu);
 -              if (policy) {
 -                      if (policy->cpuinfo.max_freq)
 -                              max_tsc_khz = policy->cpuinfo.max_freq;
 -                      cpufreq_cpu_put(policy);
 +              max_tsc_khz = tsc_khz;
 +
 +              if (IS_ENABLED(CONFIG_CPU_FREQ)) {
 +                      struct cpufreq_policy *policy;
 +                      int cpu;
 +
 +                      cpu = get_cpu();
 +                      policy = cpufreq_cpu_get(cpu);
 +                      if (policy) {
 +                              if (policy->cpuinfo.max_freq)
 +                                      max_tsc_khz = policy->cpuinfo.max_freq;
 +                              cpufreq_cpu_put(policy);
 +                      }
 +                      put_cpu();
                }
 -              put_cpu();
 -#endif
                cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
                                          CPUFREQ_TRANSITION_NOTIFIER);
        }
@@@ -9134,14 -9089,6 +9134,14 @@@ bool kvm_apicv_activated(struct kvm *kv
  }
  EXPORT_SYMBOL_GPL(kvm_apicv_activated);
  
 +bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
 +{
 +      ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
 +      ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
 +
 +      return (vm_reasons | vcpu_reasons) == 0;
 +}
 +EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
  
  static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
                                       enum kvm_apicv_inhibit reason, bool set)
@@@ -9164,7 -9111,7 +9164,7 @@@ static void kvm_apicv_init(struct kvm *
  
        if (!enable_apicv)
                set_or_clear_apicv_inhibit(inhibits,
-                                          APICV_INHIBIT_REASON_ABSENT, true);
+                                          APICV_INHIBIT_REASON_DISABLE, true);
  }
  
  static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
@@@ -9319,17 -9266,6 +9319,17 @@@ static int emulator_fix_hypercall(struc
        char instruction[3];
        unsigned long rip = kvm_rip_read(vcpu);
  
 +      /*
 +       * If the quirk is disabled, synthesize a #UD and let the guest pick up
 +       * the pieces.
 +       */
 +      if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
 +              ctxt->exception.error_code_valid = false;
 +              ctxt->exception.vector = UD_VECTOR;
 +              ctxt->have_exception = true;
 +              return X86EMUL_PROPAGATE_FAULT;
 +      }
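
Userspace opts out of the legacy patching behaviour per VM via KVM_CAP_DISABLE_QUIRKS2, which is the path the new fix_hypercall_test selftest added below exercises. A minimal sketch, assuming vm_fd is the VM file descriptor, the uapi headers exporting the quirk bit are in scope, and err(3) from <err.h> stands in for real error handling:

    struct kvm_enable_cap cap = {
            .cap     = KVM_CAP_DISABLE_QUIRKS2,
            .args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN,    /* don't rewrite VMCALL/VMMCALL */
    };

    if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
            err(1, "KVM_ENABLE_CAP(KVM_CAP_DISABLE_QUIRKS2)");

With the quirk disabled, a mismatched hypercall instruction reaches the guest as the #UD synthesized above instead of being patched in place.
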
 +
        static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
  
        return emulator_write_emulated(ctxt, rip, instruction, 3,
@@@ -9827,8 -9763,7 +9827,8 @@@ void kvm_vcpu_update_apicv(struct kvm_v
  
        down_read(&vcpu->kvm->arch.apicv_update_lock);
  
 -      activate = kvm_apicv_activated(vcpu->kvm);
 +      activate = kvm_vcpu_apicv_activated(vcpu);
 +
        if (vcpu->arch.apicv_active == activate)
                goto out;
  
@@@ -9954,6 -9889,11 +9954,11 @@@ void kvm_arch_mmu_notifier_invalidate_r
                kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
  }
  
+ void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
+ {
+       static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
+ }
  static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
  {
        if (!lapic_in_kernel(vcpu))
@@@ -10164,7 -10104,7 +10169,7 @@@ static int vcpu_enter_guest(struct kvm_
        /* Store vcpu->apicv_active before vcpu->mode.  */
        smp_store_release(&vcpu->mode, IN_GUEST_MODE);
  
-       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+       kvm_vcpu_srcu_read_unlock(vcpu);
  
        /*
         * 1) We should set ->mode before checking ->requests.  Please see
                smp_wmb();
                local_irq_enable();
                preempt_enable();
-               vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+               kvm_vcpu_srcu_read_lock(vcpu);
                r = 1;
                goto cancel_injection;
        }
                 * per-VM state, and responding vCPUs must wait for the update
                 * to complete before servicing KVM_REQ_APICV_UPDATE.
                 */
 -              WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
 +              WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
  
                exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
                if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
         */
        guest_timing_exit_irqoff();
  
 -      if (lapic_in_kernel(vcpu)) {
 -              s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
 -              if (delta != S64_MIN) {
 -                      trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
 -                      vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
 -              }
 -      }
 -
        local_irq_enable();
        preempt_enable();
  
-       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+       kvm_vcpu_srcu_read_lock(vcpu);
  
        /*
         * Profile KVM exit RIPs:
  }
  
  /* Called within kvm->srcu read side.  */
- static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+ static inline int vcpu_block(struct kvm_vcpu *vcpu)
  {
        bool hv_timer;
  
                if (hv_timer)
                        kvm_lapic_switch_to_sw_timer(vcpu);
  
-               srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+               kvm_vcpu_srcu_read_unlock(vcpu);
                if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
                        kvm_vcpu_halt(vcpu);
                else
                        kvm_vcpu_block(vcpu);
-               vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+               kvm_vcpu_srcu_read_lock(vcpu);
  
                if (hv_timer)
                        kvm_lapic_switch_to_hv_timer(vcpu);
@@@ -10406,7 -10354,6 +10411,6 @@@ static inline bool kvm_vcpu_running(str
  static int vcpu_run(struct kvm_vcpu *vcpu)
  {
        int r;
-       struct kvm *kvm = vcpu->kvm;
  
        vcpu->arch.l1tf_flush_l1d = true;
  
                if (kvm_vcpu_running(vcpu)) {
                        r = vcpu_enter_guest(vcpu);
                } else {
-                       r = vcpu_block(kvm, vcpu);
+                       r = vcpu_block(vcpu);
                }
  
                if (r <= 0)
                        break;
  
                kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
 +              if (kvm_xen_has_pending_events(vcpu))
 +                      kvm_xen_inject_pending_events(vcpu);
 +
                if (kvm_cpu_has_pending_timer(vcpu))
                        kvm_inject_pending_timer_irqs(vcpu);
  
                }
  
                if (__xfer_to_guest_mode_work_pending()) {
-                       srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+                       kvm_vcpu_srcu_read_unlock(vcpu);
                        r = xfer_to_guest_mode_handle_work(vcpu);
-                       vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+                       kvm_vcpu_srcu_read_lock(vcpu);
                        if (r)
                                return r;
                }
  
  static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
  {
-       int r;
-       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-       r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
-       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-       return r;
+       return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
  }
  
  static int complete_emulated_pio(struct kvm_vcpu *vcpu)
@@@ -10546,7 -10485,6 +10545,6 @@@ static void kvm_put_guest_fpu(struct kv
  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
  {
        struct kvm_run *kvm_run = vcpu->run;
-       struct kvm *kvm = vcpu->kvm;
        int r;
  
        vcpu_load(vcpu);
        kvm_run->flags = 0;
        kvm_load_guest_fpu(vcpu);
  
-       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+       kvm_vcpu_srcu_read_lock(vcpu);
        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
                if (kvm_run->immediate_exit) {
                        r = -EINTR;
                 */
                WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
  
-               srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+               kvm_vcpu_srcu_read_unlock(vcpu);
                kvm_vcpu_block(vcpu);
-               vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+               kvm_vcpu_srcu_read_lock(vcpu);
  
                if (kvm_apic_accept_events(vcpu) < 0) {
                        r = 0;
        if (kvm_run->kvm_valid_regs)
                store_regs(vcpu);
        post_kvm_run_save(vcpu);
-       srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+       kvm_vcpu_srcu_read_unlock(vcpu);
  
        kvm_sigset_deactivate(vcpu);
        vcpu_put(vcpu);
@@@ -11047,6 -10985,9 +11045,9 @@@ static void kvm_arch_vcpu_guestdbg_upda
        struct kvm_vcpu *vcpu;
        unsigned long i;
  
+       if (!enable_apicv)
+               return;
        down_write(&kvm->arch.apicv_update_lock);
  
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@@ -11258,8 -11199,21 +11259,21 @@@ int kvm_arch_vcpu_create(struct kvm_vcp
                r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
                if (r < 0)
                        goto fail_mmu_destroy;
-               if (kvm_apicv_activated(vcpu->kvm))
+               /*
+                * Defer evaluating inhibits until the vCPU is first run, as
+                * this vCPU will not get notified of any changes until this
+                * vCPU is visible to other vCPUs (marked online and added to
+                * the set of vCPUs).  Opportunistically mark APICv active as
 +                * VMX in particular is highly unlikely to have inhibits.
+                * Ignore the current per-VM APICv state so that vCPU creation
+                * is guaranteed to run with a deterministic value, the request
+                * will ensure the vCPU gets the correct state before VM-Entry.
+                */
+               if (enable_apicv) {
                        vcpu->arch.apicv_active = true;
+                       kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+               }
        } else
                static_branch_inc(&kvm_has_noapic_vcpu);
  
  
        vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
        vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
 +      kvm_xen_init_vcpu(vcpu);
        kvm_vcpu_mtrr_init(vcpu);
        vcpu_load(vcpu);
 -      kvm_set_tsc_khz(vcpu, max_tsc_khz);
 +      kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
        kvm_vcpu_reset(vcpu, false);
        kvm_init_mmu(vcpu);
        vcpu_put(vcpu);
@@@ -11367,7 -11320,6 +11381,7 @@@ void kvm_arch_vcpu_destroy(struct kvm_v
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
        fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
  
 +      kvm_xen_destroy_vcpu(vcpu);
        kvm_hv_vcpu_uninit(vcpu);
        kvm_pmu_destroy(vcpu);
        kfree(vcpu->arch.mce_banks);
@@@ -11629,24 -11581,6 +11643,24 @@@ void kvm_arch_hardware_disable(void
        drop_user_return_notifiers();
  }
  
 +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
 +{
 +      memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
 +
 +#define __KVM_X86_OP(func) \
 +      static_call_update(kvm_x86_##func, kvm_x86_ops.func);
 +#define KVM_X86_OP(func) \
 +      WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
 +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
 +#define KVM_X86_OP_OPTIONAL_RET0(func) \
 +      static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
 +                                         (void *)__static_call_return0);
 +#include <asm/kvm-x86-ops.h>
 +#undef __KVM_X86_OP
 +
 +      kvm_pmu_ops_update(ops->pmu_ops);
 +}
 +
  int kvm_arch_hardware_setup(void *opaque)
  {
        struct kvm_x86_init_ops *ops = opaque;
        if (r != 0)
                return r;
  
 -      memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
 -      kvm_ops_static_call_update();
 +      kvm_ops_update(ops);
  
        kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
  
@@@ -11777,7 -11712,6 +11791,7 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
        pvclock_update_vm_gtod_copy(kvm);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
  
 +      kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
        kvm->arch.guest_can_read_msr_platform_info = true;
        kvm->arch.enable_pmu = enable_pmu;
  
@@@ -11813,15 -11747,20 +11827,15 @@@ static void kvm_unload_vcpu_mmu(struct 
        vcpu_put(vcpu);
  }
  
 -static void kvm_free_vcpus(struct kvm *kvm)
 +static void kvm_unload_vcpu_mmus(struct kvm *kvm)
  {
        unsigned long i;
        struct kvm_vcpu *vcpu;
  
 -      /*
 -       * Unpin any mmu pages first.
 -       */
        kvm_for_each_vcpu(i, vcpu, kvm) {
                kvm_clear_async_pf_completion_queue(vcpu);
                kvm_unload_vcpu_mmu(vcpu);
        }
 -
 -      kvm_destroy_vcpus(kvm);
  }
  
  void kvm_arch_sync_events(struct kvm *kvm)
@@@ -11927,12 -11866,11 +11941,12 @@@ void kvm_arch_destroy_vm(struct kvm *kv
                __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
                mutex_unlock(&kvm->slots_lock);
        }
 +      kvm_unload_vcpu_mmus(kvm);
        static_call_cond(kvm_x86_vm_destroy)(kvm);
        kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
        kvm_pic_destroy(kvm);
        kvm_ioapic_destroy(kvm);
 -      kvm_free_vcpus(kvm);
 +      kvm_destroy_vcpus(kvm);
        kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
        kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
        kvm_mmu_uninit_vm(kvm);
@@@ -12255,12 -12193,6 +12269,12 @@@ static inline bool kvm_vcpu_has_events(
            kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
                return true;
  
 +      if (kvm_xen_has_pending_events(vcpu))
 +              return true;
 +
 +      if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu))
 +              return true;
 +
        return false;
  }
  
@@@ -12358,6 -12290,25 +12372,6 @@@ void kvm_set_rflags(struct kvm_vcpu *vc
  }
  EXPORT_SYMBOL_GPL(kvm_set_rflags);
  
 -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 -{
 -      int r;
 -
 -      if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
 -            work->wakeup_all)
 -              return;
 -
 -      r = kvm_mmu_reload(vcpu);
 -      if (unlikely(r))
 -              return;
 -
 -      if (!vcpu->arch.mmu->direct_map &&
 -            work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
 -              return;
 -
 -      kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
 -}
 -
  static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
  {
        BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
@@@ -13049,7 -13000,6 +13063,7 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irt
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
 +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
diff --combined include/linux/kvm_host.h
index f94f72bbd2d3ab2c12249e9f6e99b38fe9b44cb2,34eed5f85ed607432bd40559d1f9c6356cb6f083..883e86ec8e8c4c1fdcee7ee9adb1e87875db522c
@@@ -315,7 -315,10 +315,10 @@@ struct kvm_vcpu 
        int cpu;
        int vcpu_id; /* id given by userspace at creation */
        int vcpu_idx; /* index in kvm->vcpus array */
-       int srcu_idx;
+       int ____srcu_idx; /* Don't use this directly.  You've been warned. */
+ #ifdef CONFIG_PROVE_RCU
+       int srcu_depth;
+ #endif
        int mode;
        u64 requests;
        unsigned long guest_debug;
@@@ -611,8 -614,7 +614,8 @@@ struct kvm_hv_sint 
  
  struct kvm_xen_evtchn {
        u32 port;
 -      u32 vcpu;
 +      u32 vcpu_id;
 +      int vcpu_idx;
        u32 priority;
  };
  
@@@ -725,7 -727,6 +728,7 @@@ struct kvm 
         * and is accessed atomically.
         */
        atomic_t online_vcpus;
 +      int max_vcpus;
        int created_vcpus;
        int last_boosted_vcpu;
        struct list_head vm_list;
@@@ -842,6 -843,25 +845,25 @@@ static inline void kvm_vm_bugged(struc
        unlikely(__ret);                                        \
  })
  
+ static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu)
+ {
+ #ifdef CONFIG_PROVE_RCU
+       WARN_ONCE(vcpu->srcu_depth++,
+                 "KVM: Illegal vCPU srcu_idx LOCK, depth=%d", vcpu->srcu_depth - 1);
+ #endif
+       vcpu->____srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ }
+ static inline void kvm_vcpu_srcu_read_unlock(struct kvm_vcpu *vcpu)
+ {
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->____srcu_idx);
+ #ifdef CONFIG_PROVE_RCU
+       WARN_ONCE(--vcpu->srcu_depth,
+                 "KVM: Illegal vCPU srcu_idx UNLOCK, depth=%d", vcpu->srcu_depth);
+ #endif
+ }
  static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm)
  {
        return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET);
@@@ -2199,6 -2219,8 +2221,8 @@@ static inline long kvm_arch_vcpu_async_
  void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
                                            unsigned long start, unsigned long end);
  
+ void kvm_arch_guest_memory_reclaimed(struct kvm *kvm);
  #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
  int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
  #else
diff --combined include/uapi/linux/kvm.h
index e10d131edd805e4715698555a80af173e6d9445a,32c56384fd08f43bcc6e1c0f4d21d76772a9d60f..5088bd9f1922851fb62ea7562ccbeb6b64eeed02
@@@ -444,7 -444,8 +444,9 @@@ struct kvm_run 
  #define KVM_SYSTEM_EVENT_SHUTDOWN       1
  #define KVM_SYSTEM_EVENT_RESET          2
  #define KVM_SYSTEM_EVENT_CRASH          3
- #define KVM_SYSTEM_EVENT_SEV_TERM       4
+ #define KVM_SYSTEM_EVENT_WAKEUP         4
+ #define KVM_SYSTEM_EVENT_SUSPEND        5
++#define KVM_SYSTEM_EVENT_SEV_TERM       6
                        __u32 type;
                        __u32 ndata;
                        union {
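
The renumbering matters to any VMM that switches on these values. A hedged run-loop sketch, where run is the mmap'd struct kvm_run and the helper names are hypothetical:

    /* After KVM_RUN returns with run->exit_reason == KVM_EXIT_SYSTEM_EVENT. */
    switch (run->system_event.type) {
    case KVM_SYSTEM_EVENT_SHUTDOWN:
    case KVM_SYSTEM_EVENT_RESET:
    case KVM_SYSTEM_EVENT_CRASH:
            handle_legacy_system_event(run);        /* hypothetical VMM helper */
            break;
    case KVM_SYSTEM_EVENT_WAKEUP:
    case KVM_SYSTEM_EVENT_SUSPEND:
            handle_suspend_or_wakeup(run);          /* hypothetical, arm64 suspend flow */
            break;
    case KVM_SYSTEM_EVENT_SEV_TERM:
            handle_sev_termination(run);            /* hypothetical, note the new value 6 */
            break;
    }
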
@@@ -647,6 -648,7 +649,7 @@@ struct kvm_vapic_addr 
  #define KVM_MP_STATE_OPERATING         7
  #define KVM_MP_STATE_LOAD              8
  #define KVM_MP_STATE_AP_RESET_HOLD     9
+ #define KVM_MP_STATE_SUSPENDED         10
  
  struct kvm_mp_state {
        __u32 mp_state;
@@@ -1151,8 -1153,9 +1154,9 @@@ struct kvm_ppc_resize_hpt 
  #define KVM_CAP_S390_MEM_OP_EXTENSION 211
  #define KVM_CAP_PMU_CAPABILITY 212
  #define KVM_CAP_DISABLE_QUIRKS2 213
 -/* #define KVM_CAP_VM_TSC_CONTROL 214 */
 +#define KVM_CAP_VM_TSC_CONTROL 214
  #define KVM_CAP_SYSTEM_EVENT_DATA 215
+ #define KVM_CAP_ARM_SYSTEM_SUSPEND 216
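
A small sketch of opting in on the VM file descriptor; this assumes, as documented for the capability, that it is enabled with KVM_ENABLE_CAP and takes no arguments (vm_fd and err(3) are illustrative):

    struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_SYSTEM_SUSPEND };

    if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_SYSTEM_SUSPEND) > 0 &&
        ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
            err(1, "KVM_ENABLE_CAP(KVM_CAP_ARM_SYSTEM_SUSPEND)");

    /* Once enabled, a guest PSCI SYSTEM_SUSPEND call reaches userspace as a
     * KVM_SYSTEM_EVENT_SUSPEND exit; userspace can then park the vCPU, e.g.
     * by setting KVM_MP_STATE_SUSPENDED (see the run-loop sketch above). */
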
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
@@@ -1241,7 -1244,6 +1245,7 @@@ struct kvm_x86_mce 
  #define KVM_XEN_HVM_CONFIG_SHARED_INFO                (1 << 2)
  #define KVM_XEN_HVM_CONFIG_RUNSTATE           (1 << 3)
  #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL      (1 << 4)
 +#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND                (1 << 5)
  
  struct kvm_xen_hvm_config {
        __u32 flags;
@@@ -1480,8 -1482,7 +1484,8 @@@ struct kvm_s390_ucas_mapping 
  #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
  /* Available with KVM_CAP_PPC_GET_PVINFO */
  #define KVM_PPC_GET_PVINFO      _IOW(KVMIO,  0xa1, struct kvm_ppc_pvinfo)
 -/* Available with KVM_CAP_TSC_CONTROL */
 +/* Available with KVM_CAP_TSC_CONTROL for a vCPU, or with
 +*  KVM_CAP_VM_TSC_CONTROL to set defaults for a VM */
  #define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
  #define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
  /* Available with KVM_CAP_PCI_2_3 */
@@@ -1697,32 -1698,6 +1701,32 @@@ struct kvm_xen_hvm_attr 
                struct {
                        __u64 gfn;
                } shared_info;
 +              struct {
 +                      __u32 send_port;
 +                      __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
 +                      __u32 flags;
 +#define KVM_XEN_EVTCHN_DEASSIGN               (1 << 0)
 +#define KVM_XEN_EVTCHN_UPDATE         (1 << 1)
 +#define KVM_XEN_EVTCHN_RESET          (1 << 2)
 +                      /*
 +                       * Events sent by the guest are either looped back to
 +                       * the guest itself (potentially on a different port#)
 +                       * or signalled via an eventfd.
 +                       */
 +                      union {
 +                              struct {
 +                                      __u32 port;
 +                                      __u32 vcpu;
 +                                      __u32 priority;
 +                              } port;
 +                              struct {
 +                                      __u32 port; /* Zero for eventfd */
 +                                      __s32 fd;
 +                              } eventfd;
 +                              __u32 padding[4];
 +                      } deliver;
 +              } evtchn;
 +              __u32 xen_version;
                __u64 pad[8];
        } u;
  };
  #define KVM_XEN_ATTR_TYPE_LONG_MODE           0x0
  #define KVM_XEN_ATTR_TYPE_SHARED_INFO         0x1
  #define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR               0x2
 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
 +#define KVM_XEN_ATTR_TYPE_EVTCHN              0x3
 +#define KVM_XEN_ATTR_TYPE_XEN_VERSION         0x4
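
As a small usage sketch for the VM-wide attribute path (the version encoding follows Xen's usual major << 16 | minor convention, which is an assumption here rather than anything this header spells out; vm_fd and err(3) are illustrative):

    struct kvm_xen_hvm_attr va = {
            .type          = KVM_XEN_ATTR_TYPE_XEN_VERSION,
            .u.xen_version = (4 << 16) | 16,        /* report "Xen 4.16" to the guest */
    };

    if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &va))
            err(1, "KVM_XEN_HVM_SET_ATTR(XEN_VERSION)");

The KVM_XEN_ATTR_TYPE_EVTCHN side of this union is exercised end to end by the xen_shinfo_test changes further down.
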
  
  /* Per-vCPU Xen attributes */
  #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)
  #define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO,  0xcb, struct kvm_xen_vcpu_attr)
  
 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
 +#define KVM_XEN_HVM_EVTCHN_SEND       _IOW(KVMIO,  0xd0, struct kvm_irq_routing_xen_evtchn)
 +
  #define KVM_GET_SREGS2             _IOR(KVMIO,  0xcc, struct kvm_sregs2)
  #define KVM_SET_SREGS2             _IOW(KVMIO,  0xcd, struct kvm_sregs2)
  
@@@ -1759,13 -1728,6 +1763,13 @@@ struct kvm_xen_vcpu_attr 
                        __u64 time_blocked;
                        __u64 time_offline;
                } runstate;
 +              __u32 vcpu_id;
 +              struct {
 +                      __u32 port;
 +                      __u32 priority;
 +                      __u64 expires_ns;
 +              } timer;
 +              __u8 vector;
        } u;
  };
  
  #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT       0x3
  #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA  0x4
  #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST        0x5
 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID                0x6
 +#define KVM_XEN_VCPU_ATTR_TYPE_TIMER          0x7
 +#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR  0x8
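
And on the per-vCPU side, a sketch that pins down the guest's Xen vCPU id before the timer attribute is used; vcpu_fd and the id value are illustrative:

    struct kvm_xen_vcpu_attr xva = {
            .type      = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
            .u.vcpu_id = 0,                         /* the id this vCPU has inside the guest */
    };

    if (ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &xva))
            err(1, "KVM_XEN_VCPU_SET_ATTR(VCPU_ID)");
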
  
  /* Secure Encrypted Virtualization command */
  enum sev_cmd_id {
index 56140068b7637a930c6873c73e443c22369beb0f,b17e464ec661b3967fe49a5f7876c9883e913df7..54fc269c1788cba2da5ad7e2156df29e290c7cac
@@@ -2,7 -2,8 +2,8 @@@
  /aarch64/arch_timer
  /aarch64/debug-exceptions
  /aarch64/get-reg-list
- /aarch64/psci_cpu_on_test
+ /aarch64/hypercalls
+ /aarch64/psci_test
  /aarch64/vcpu_width_config
  /aarch64/vgic_init
  /aarch64/vgic_irq
@@@ -16,7 -17,6 +17,7 @@@
  /x86_64/debug_regs
  /x86_64/evmcs_test
  /x86_64/emulator_error_test
 +/x86_64/fix_hypercall_test
  /x86_64/get_msr_index_features
  /x86_64/kvm_clock_test
  /x86_64/kvm_pv_test
index d0d09ca0d495739608814f9d24cbbf088a3637f3,97eef0c03d3bfa46c5e97f3641a56b8f76aadc60..81470a99ed1c04ab4bfeee5ca438b170327573e8
@@@ -48,7 -48,6 +48,7 @@@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpu
  TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
  TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
  TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
 +TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
  TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
  TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
  TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
@@@ -66,7 -65,6 +66,7 @@@ TEST_GEN_PROGS_x86_64 += x86_64/state_t
  TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
  TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
  TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
 +TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync
  TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
  TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test
  TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
@@@ -83,7 -81,7 +83,7 @@@ TEST_GEN_PROGS_x86_64 += x86_64/xapic_s
  TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
  TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
  TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
 -TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test
 +TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test
  TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test
  TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test
  TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
@@@ -107,7 -105,8 +107,8 @@@ TEST_GEN_PROGS_x86_64 += system_counter
  TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
  TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
  TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
- TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test
+ TEST_GEN_PROGS_aarch64 += aarch64/hypercalls
+ TEST_GEN_PROGS_aarch64 += aarch64/psci_test
  TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config
  TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
  TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq
index 5e6f40633ea669b0b1ef3184890fd836cfc30812,bcd3708278593dd14585d32510d5cda20a98c677..7a51bb648fbb3cbf534203eed1be1cff9d2672a7
@@@ -23,7 -23,6 +23,6 @@@
  #define SHINFO_REGION_GVA     0xc0000000ULL
  #define SHINFO_REGION_GPA     0xc0000000ULL
  #define SHINFO_REGION_SLOT    10
- #define PAGE_SIZE             4096
  
  #define DUMMY_REGION_GPA      (SHINFO_REGION_GPA + (2 * PAGE_SIZE))
  #define DUMMY_REGION_SLOT     11
  
  #define EVTCHN_VECTOR 0x10
  
 +#define EVTCHN_TEST1 15
 +#define EVTCHN_TEST2 66
 +#define EVTCHN_TIMER 13
 +
  static struct kvm_vm *vm;
  
  #define XEN_HYPERCALL_MSR     0x40000000
  
  #define MIN_STEAL_TIME                50000
  
 +#define __HYPERVISOR_set_timer_op     15
 +#define __HYPERVISOR_sched_op         29
 +#define __HYPERVISOR_event_channel_op 32
 +
 +#define SCHEDOP_poll                  3
 +
 +#define EVTCHNOP_send                 4
 +
 +#define EVTCHNSTAT_interdomain                2
 +
 +struct evtchn_send {
 +      u32 port;
 +};
 +
 +struct sched_poll {
 +      u32 *ports;
 +      unsigned int nr_ports;
 +      u64 timeout;
 +};
 +
  struct pvclock_vcpu_time_info {
        u32   version;
        u32   pad0;
@@@ -131,25 -106,15 +130,25 @@@ struct 
        struct kvm_irq_routing_entry entries[2];
  } irq_routes;
  
 +bool guest_saw_irq;
 +
  static void evtchn_handler(struct ex_regs *regs)
  {
        struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
        vi->evtchn_upcall_pending = 0;
        vi->evtchn_pending_sel = 0;
 +      guest_saw_irq = true;
  
        GUEST_SYNC(0x20);
  }
  
 +static void guest_wait_for_irq(void)
 +{
 +      while (!guest_saw_irq)
 +              __asm__ __volatile__ ("rep nop" : : : "memory");
 +      guest_saw_irq = false;
 +}
 +
  static void guest_code(void)
  {
        struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
        /* Trigger an interrupt injection */
        GUEST_SYNC(0);
  
 +      guest_wait_for_irq();
 +
        /* Test having the host set runstates manually */
        GUEST_SYNC(RUNSTATE_runnable);
        GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
        /* Now deliver an *unmasked* interrupt */
        GUEST_SYNC(8);
  
 -      while (!si->evtchn_pending[1])
 -              __asm__ __volatile__ ("rep nop" : : : "memory");
 +      guest_wait_for_irq();
  
        /* Change memslots and deliver an interrupt */
        GUEST_SYNC(9);
  
 -      for (;;)
 -              __asm__ __volatile__ ("rep nop" : : : "memory");
 +      guest_wait_for_irq();
 +
 +      /* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */
 +      GUEST_SYNC(10);
 +
 +      guest_wait_for_irq();
 +
 +      GUEST_SYNC(11);
 +
 +      /* Our turn. Deliver event channel (to ourselves) with
 +       * EVTCHNOP_send hypercall. */
 +      unsigned long rax;
 +      struct evtchn_send s = { .port = 127 };
 +      __asm__ __volatile__ ("vmcall" :
 +                            "=a" (rax) :
 +                            "a" (__HYPERVISOR_event_channel_op),
 +                            "D" (EVTCHNOP_send),
 +                            "S" (&s));
 +
 +      GUEST_ASSERT(rax == 0);
 +
 +      guest_wait_for_irq();
 +
 +      GUEST_SYNC(12);
 +
 +      /* Deliver "outbound" event channel to an eventfd which
 +       * happens to be one of our own irqfds. */
 +      s.port = 197;
 +      __asm__ __volatile__ ("vmcall" :
 +                            "=a" (rax) :
 +                            "a" (__HYPERVISOR_event_channel_op),
 +                            "D" (EVTCHNOP_send),
 +                            "S" (&s));
 +
 +      GUEST_ASSERT(rax == 0);
 +
 +      guest_wait_for_irq();
 +
 +      GUEST_SYNC(13);
 +
 +      /* Set a timer 100ms in the future. */
 +      __asm__ __volatile__ ("vmcall" :
 +                            "=a" (rax) :
 +                            "a" (__HYPERVISOR_set_timer_op),
 +                            "D" (rs->state_entry_time + 100000000));
 +      GUEST_ASSERT(rax == 0);
 +
 +      GUEST_SYNC(14);
 +
 +      /* Now wait for the timer */
 +      guest_wait_for_irq();
 +
 +      GUEST_SYNC(15);
 +
 +      /* The host has 'restored' the timer. Just wait for it. */
 +      guest_wait_for_irq();
 +
 +      GUEST_SYNC(16);
 +
 +      /* Poll for an event channel port which is already set */
 +      u32 ports[1] = { EVTCHN_TIMER };
 +      struct sched_poll p = {
 +              .ports = ports,
 +              .nr_ports = 1,
 +              .timeout = 0,
 +      };
 +
 +      __asm__ __volatile__ ("vmcall" :
 +                            "=a" (rax) :
 +                            "a" (__HYPERVISOR_sched_op),
 +                            "D" (SCHEDOP_poll),
 +                            "S" (&p));
 +
 +      GUEST_ASSERT(rax == 0);
 +
 +      GUEST_SYNC(17);
 +
 +      /* Poll for an unset port and wait for the timeout. */
 +      p.timeout = 100000000;
 +      __asm__ __volatile__ ("vmcall" :
 +                            "=a" (rax) :
 +                            "a" (__HYPERVISOR_sched_op),
 +                            "D" (SCHEDOP_poll),
 +                            "S" (&p));
 +
 +      GUEST_ASSERT(rax == 0);
 +
 +      GUEST_SYNC(18);
 +
 +      /* A timer will wake the masked port we're waiting on, while we poll */
 +      p.timeout = 0;
 +      __asm__ __volatile__ ("vmcall" :
 +                            "=a" (rax) :
 +                            "a" (__HYPERVISOR_sched_op),
 +                            "D" (SCHEDOP_poll),
 +                            "S" (&p));
 +
 +      GUEST_ASSERT(rax == 0);
 +
 +      GUEST_SYNC(19);
 +
 +      /* A timer wakes an *unmasked* port, which should wake us with an
 +       * actual interrupt, while we're polling on a different port. */
 +      ports[0]++;
 +      p.timeout = 0;
 +      __asm__ __volatile__ ("vmcall" :
 +                            "=a" (rax) :
 +                            "a" (__HYPERVISOR_sched_op),
 +                            "D" (SCHEDOP_poll),
 +                            "S" (&p));
 +
 +      GUEST_ASSERT(rax == 0);
 +
 +      guest_wait_for_irq();
 +
 +      GUEST_SYNC(20);
 +
 +      /* Timer should have fired already */
 +      guest_wait_for_irq();
 +
 +      GUEST_SYNC(21);
  }
  
  static int cmp_timespec(struct timespec *a, struct timespec *b)
        else
                return 0;
  }
 +struct vcpu_info *vinfo;
  
  static void handle_alrm(int sig)
  {
 +      if (vinfo)
 +              printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending);
 +      vcpu_dump(stdout, vm, VCPU_ID, 0);
        TEST_FAIL("IRQ delivery timed out");
  }
  
@@@ -371,7 -212,6 +370,7 @@@ int main(int argc, char *argv[]
  
        bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
        bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
 +      bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
  
        clock_gettime(CLOCK_REALTIME, &min_ts);
  
                .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
                .msr = XEN_HYPERCALL_MSR,
        };
 +
 +      /* Let the kernel know that we *will* use it for sending all
 +       * event channels, which lets it intercept SCHEDOP_poll */
 +      if (do_evtchn_tests)
 +              hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
 +
        vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
  
        struct kvm_xen_hvm_attr lm = {
  
                /* Unexpected, but not a KVM failure */
                if (irq_fd[0] == -1 || irq_fd[1] == -1)
 -                      do_eventfd_tests = false;
 +                      do_evtchn_tests = do_eventfd_tests = false;
        }
  
        if (do_eventfd_tests) {
  
                irq_routes.entries[0].gsi = 32;
                irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
 -              irq_routes.entries[0].u.xen_evtchn.port = 15;
 +              irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1;
                irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID;
                irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
  
                irq_routes.entries[1].gsi = 33;
                irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
 -              irq_routes.entries[1].u.xen_evtchn.port = 66;
 +              irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2;
                irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID;
                irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
  
                sigaction(SIGALRM, &sa, NULL);
        }
  
 -      struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
 +      struct kvm_xen_vcpu_attr tmr = {
 +              .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
 +              .u.timer.port = EVTCHN_TIMER,
 +              .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
 +              .u.timer.expires_ns = 0
 +      };
 +
 +      if (do_evtchn_tests) {
 +              struct kvm_xen_hvm_attr inj = {
 +                      .type = KVM_XEN_ATTR_TYPE_EVTCHN,
 +                      .u.evtchn.send_port = 127,
 +                      .u.evtchn.type = EVTCHNSTAT_interdomain,
 +                      .u.evtchn.flags = 0,
 +                      .u.evtchn.deliver.port.port = EVTCHN_TEST1,
 +                      .u.evtchn.deliver.port.vcpu = VCPU_ID + 1,
 +                      .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
 +              };
 +              vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
 +
 +              /* Test migration to a different vCPU */
 +              inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE;
 +              inj.u.evtchn.deliver.port.vcpu = VCPU_ID;
 +              vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
 +
 +              inj.u.evtchn.send_port = 197;
 +              inj.u.evtchn.deliver.eventfd.port = 0;
 +              inj.u.evtchn.deliver.eventfd.fd = irq_fd[1];
 +              inj.u.evtchn.flags = 0;
 +              vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
 +
 +              vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
 +      }
 +      vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
        vinfo->evtchn_upcall_pending = 0;
  
        struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
                                        goto done;
                                if (verbose)
                                        printf("Testing masked event channel\n");
 -                              shinfo->evtchn_mask[0] = 0x8000;
 +                              shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1;
                                eventfd_write(irq_fd[0], 1UL);
                                alarm(1);
                                break;
                                break;
  
                        case 9:
 +                              TEST_ASSERT(!evtchn_irq_expected,
 +                                          "Expected event channel IRQ but it didn't happen");
 +                              shinfo->evtchn_pending[1] = 0;
                                if (verbose)
                                        printf("Testing event channel after memslot change\n");
                                vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
                                alarm(1);
                                break;
  
 +                      case 10:
 +                              TEST_ASSERT(!evtchn_irq_expected,
 +                                          "Expected event channel IRQ but it didn't happen");
 +                              if (!do_evtchn_tests)
 +                                      goto done;
 +
 +                              shinfo->evtchn_pending[0] = 0;
 +                              if (verbose)
 +                                      printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n");
 +
 +                              struct kvm_irq_routing_xen_evtchn e;
 +                              e.port = EVTCHN_TEST2;
 +                              e.vcpu = VCPU_ID;
 +                              e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
 +
 +                              vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e);
 +                              evtchn_irq_expected = true;
 +                              alarm(1);
 +                              break;
 +
 +                      case 11:
 +                              TEST_ASSERT(!evtchn_irq_expected,
 +                                          "Expected event channel IRQ but it didn't happen");
 +                              shinfo->evtchn_pending[1] = 0;
 +
 +                              if (verbose)
 +                                      printf("Testing guest EVTCHNOP_send direct to evtchn\n");
 +                              evtchn_irq_expected = true;
 +                              alarm(1);
 +                              break;
 +
 +                      case 12:
 +                              TEST_ASSERT(!evtchn_irq_expected,
 +                                          "Expected event channel IRQ but it didn't happen");
 +                              shinfo->evtchn_pending[0] = 0;
 +
 +                              if (verbose)
 +                                      printf("Testing guest EVTCHNOP_send to eventfd\n");
 +                              evtchn_irq_expected = true;
 +                              alarm(1);
 +                              break;
 +
 +                      case 13:
 +                              TEST_ASSERT(!evtchn_irq_expected,
 +                                          "Expected event channel IRQ but it didn't happen");
 +                              shinfo->evtchn_pending[1] = 0;
 +
 +                              if (verbose)
 +                                      printf("Testing guest oneshot timer\n");
 +                              break;
 +
 +                      case 14:
 +                              memset(&tmr, 0, sizeof(tmr));
 +                              tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
 +                              vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
 +                              TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER,
 +                                          "Timer port not returned");
 +                              TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
 +                                          "Timer priority not returned");
 +                              TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time,
 +                                          "Timer expiry not returned");
 +                              evtchn_irq_expected = true;
 +                              alarm(1);
 +                              break;
 +
 +                      case 15:
 +                              TEST_ASSERT(!evtchn_irq_expected,
 +                                          "Expected event channel IRQ but it didn't happen");
 +                              shinfo->evtchn_pending[0] = 0;
 +
 +                              if (verbose)
 +                                      printf("Testing restored oneshot timer\n");
 +
 +                              tmr.u.timer.expires_ns = rs->state_entry_time + 100000000,
 +                              vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
 +                              evtchn_irq_expected = true;
 +                              alarm(1);
 +                              break;
 +
 +                      case 16:
 +                              TEST_ASSERT(!evtchn_irq_expected,
 +                                          "Expected event channel IRQ but it didn't happen");
 +
 +                              if (verbose)
 +                                      printf("Testing SCHEDOP_poll with already pending event\n");
 +                              shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER;
 +                              alarm(1);
 +                              break;
 +
 +                      case 17:
 +                              if (verbose)
 +                                      printf("Testing SCHEDOP_poll timeout\n");
 +                              shinfo->evtchn_pending[0] = 0;
 +                              alarm(1);
 +                              break;
 +
 +                      case 18:
 +                              if (verbose)
 +                                      printf("Testing SCHEDOP_poll wake on masked event\n");
 +
 +                              tmr.u.timer.expires_ns = rs->state_entry_time + 100000000,
 +                              vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
 +                              alarm(1);
 +                              break;
 +
 +                      case 19:
 +                              shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0;
 +                              if (verbose)
 +                                      printf("Testing SCHEDOP_poll wake on unmasked event\n");
 +
 +                              evtchn_irq_expected = true;
 +                              tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
 +                              vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
 +
 +                              /* Read it back and check the pending time is reported correctly */
 +                              tmr.u.timer.expires_ns = 0;
 +                              vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
 +                              TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000,
 +                                          "Timer not reported pending");
 +                              alarm(1);
 +                              break;
 +
 +                      case 20:
 +                              TEST_ASSERT(!evtchn_irq_expected,
 +                                          "Expected event channel IRQ but it didn't happen");
 +                              /* Read timer and check it is no longer pending */
 +                              vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
 +                              TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending");
 +
 +                              shinfo->evtchn_pending[0] = 0;
 +                              if (verbose)
 +                                      printf("Testing timer in the past\n");
 +
 +                              evtchn_irq_expected = true;
 +                              tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL;
 +                              vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
 +                              alarm(1);
 +                              break;
 +
 +                      case 21:
 +                              TEST_ASSERT(!evtchn_irq_expected,
 +                                          "Expected event channel IRQ but it didn't happen");
 +                              goto done;
 +
                        case 0x20:
                                TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
                                evtchn_irq_expected = false;
 -                              if (shinfo->evtchn_pending[1] &&
 -                                  shinfo->evtchn_pending[0])
 -                                      goto done;
                                break;
                        }
                        break;
        }
  
   done:
 +      alarm(0);
        clock_gettime(CLOCK_REALTIME, &max_ts);
  
        /*
diff --combined virt/kvm/kvm_main.c
index 655365b2cbe818b5b2a81227a4391f729cec7f09,6d971fb1b08d8ed196e139e54004b0ac9f5fab0c..342043b301254c3fc590fb9cfb4fb05ee487fa0e
@@@ -164,6 -164,10 +164,10 @@@ __weak void kvm_arch_mmu_notifier_inval
  {
  }
  
+ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
+ {
+ }
  bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
  {
        /*
@@@ -357,6 -361,12 +361,12 @@@ void kvm_flush_remote_tlbs(struct kvm *
  EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
  #endif
  
+ static void kvm_flush_shadow_all(struct kvm *kvm)
+ {
+       kvm_arch_flush_shadow_all(kvm);
+       kvm_arch_guest_memory_reclaimed(kvm);
+ }
  #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
  static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
                                               gfp_t gfp_flags)
@@@ -485,12 -495,15 +495,15 @@@ typedef bool (*hva_handler_t)(struct kv
  typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
                             unsigned long end);
  
+ typedef void (*on_unlock_fn_t)(struct kvm *kvm);
  struct kvm_hva_range {
        unsigned long start;
        unsigned long end;
        pte_t pte;
        hva_handler_t handler;
        on_lock_fn_t on_lock;
+       on_unlock_fn_t on_unlock;
        bool flush_on_ret;
        bool may_block;
  };
@@@ -578,8 -591,11 +591,11 @@@ static __always_inline int __kvm_handle
        if (range->flush_on_ret && ret)
                kvm_flush_remote_tlbs(kvm);
  
-       if (locked)
+       if (locked) {
                KVM_MMU_UNLOCK(kvm);
+               if (!IS_KVM_NULL_FN(range->on_unlock))
+                       range->on_unlock(kvm);
+       }
  
        srcu_read_unlock(&kvm->srcu, idx);
  
@@@ -600,6 -616,7 +616,7 @@@ static __always_inline int kvm_handle_h
                .pte            = pte,
                .handler        = handler,
                .on_lock        = (void *)kvm_null_fn,
+               .on_unlock      = (void *)kvm_null_fn,
                .flush_on_ret   = true,
                .may_block      = false,
        };
@@@ -619,6 -636,7 +636,7 @@@ static __always_inline int kvm_handle_h
                .pte            = __pte(0),
                .handler        = handler,
                .on_lock        = (void *)kvm_null_fn,
+               .on_unlock      = (void *)kvm_null_fn,
                .flush_on_ret   = false,
                .may_block      = false,
        };
@@@ -662,7 -680,7 +680,7 @@@ void kvm_inc_notifier_count(struct kvm 
                kvm->mmu_notifier_range_end = end;
        } else {
                /*
-                * Fully tracking multiple concurrent ranges has dimishing
+                * Fully tracking multiple concurrent ranges has diminishing
                 * returns. Keep things simple and just find the minimal range
                 * which includes the current and new ranges. As there won't be
                 * enough information to subtract a range after its invalidate
@@@ -687,6 -705,7 +705,7 @@@ static int kvm_mmu_notifier_invalidate_
                .pte            = __pte(0),
                .handler        = kvm_unmap_gfn_range,
                .on_lock        = kvm_inc_notifier_count,
+               .on_unlock      = kvm_arch_guest_memory_reclaimed,
                .flush_on_ret   = true,
                .may_block      = mmu_notifier_range_blockable(range),
        };
@@@ -741,6 -760,7 +760,7 @@@ static void kvm_mmu_notifier_invalidate
                .pte            = __pte(0),
                .handler        = (void *)kvm_null_fn,
                .on_lock        = kvm_dec_notifier_count,
+               .on_unlock      = (void *)kvm_null_fn,
                .flush_on_ret   = false,
                .may_block      = mmu_notifier_range_blockable(range),
        };
@@@ -813,7 -833,7 +833,7 @@@ static void kvm_mmu_notifier_release(st
        int idx;
  
        idx = srcu_read_lock(&kvm->srcu);
-       kvm_arch_flush_shadow_all(kvm);
+       kvm_flush_shadow_all(kvm);
        srcu_read_unlock(&kvm->srcu, idx);
  }
  
@@@ -955,12 -975,6 +975,6 @@@ static int kvm_create_vm_debugfs(struc
        int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
                                      kvm_vcpu_stats_header.num_desc;
  
-       /*
-        * Force subsequent debugfs file creations to fail if the VM directory
-        * is not created.
-        */
-       kvm->debugfs_dentry = ERR_PTR(-ENOENT);
        if (!debugfs_initialized())
                return 0;
  
@@@ -1078,10 -1092,15 +1092,16 @@@ static struct kvm *kvm_create_vm(unsign
        spin_lock_init(&kvm->gpc_lock);
  
        INIT_LIST_HEAD(&kvm->devices);
 +      kvm->max_vcpus = KVM_MAX_VCPUS;
  
        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
  
+       /*
+        * Force subsequent debugfs file creations to fail if the VM directory
+        * is not created (by kvm_create_vm_debugfs()).
+        */
+       kvm->debugfs_dentry = ERR_PTR(-ENOENT);
        if (init_srcu_struct(&kvm->srcu))
                goto out_err_no_srcu;
        if (init_srcu_struct(&kvm->irq_srcu))
@@@ -1226,7 -1245,7 +1246,7 @@@ static void kvm_destroy_vm(struct kvm *
        WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
        kvm->mn_active_invalidate_count = 0;
  #else
-       kvm_arch_flush_shadow_all(kvm);
+       kvm_flush_shadow_all(kvm);
  #endif
        kvm_arch_destroy_vm(kvm);
        kvm_destroy_devices(kvm);
@@@ -1653,6 -1672,7 +1673,7 @@@ static void kvm_invalidate_memslot(stru
         *      - kvm_is_visible_gfn (mmu_check_root)
         */
        kvm_arch_flush_shadow_memslot(kvm, old);
+       kvm_arch_guest_memory_reclaimed(kvm);
  
        /* Was released by kvm_swap_active_memslots, reacquire. */
        mutex_lock(&kvm->slots_arch_lock);
@@@ -1800,7 -1820,7 +1821,7 @@@ static int kvm_set_memslot(struct kvm *
  
        /*
         * No need to refresh new->arch, changes after dropping slots_arch_lock
-        * will directly hit the final, active memsot.  Architectures are
+        * will directly hit the final, active memslot.  Architectures are
         * responsible for knowing that new->arch may be stale.
         */
        kvm_commit_memory_region(kvm, old, new, change);
@@@ -3733,7 -3753,7 +3754,7 @@@ static int kvm_vm_ioctl_create_vcpu(str
                return -EINVAL;
  
        mutex_lock(&kvm->lock);
 -      if (kvm->created_vcpus == KVM_MAX_VCPUS) {
 +      if (kvm->created_vcpus >= kvm->max_vcpus) {
                mutex_unlock(&kvm->lock);
                return -EINVAL;
        }