Merge tag 'kvm-x86-misc-6.10' of https://github.com/kvm-x86/linux into HEAD

author Paolo Bonzini <[email protected]>

Sun, 12 May 2024 07:18:44 +0000 (03:18 -0400)

committer Paolo Bonzini <[email protected]>

Sun, 12 May 2024 07:18:44 +0000 (03:18 -0400)
author Paolo Bonzini <[email protected]>
Sun, 12 May 2024 07:18:44 +0000 (03:18 -0400)
committer Paolo Bonzini <[email protected]>
Sun, 12 May 2024 07:18:44 +0000 (03:18 -0400)
diff --combined arch/x86/kvm/cpuid.c

index 1851b3870a9c05cb2c8bba19b4992a7486d1f207,1c5583addc90be701c8d763319a18b95a25bc06b..f2f2be5d11415d38eaf8d854cd86cb171bf2937b
--- 1/arch/x86/kvm/cpuid.c
--- 2/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@@ -376,7 -376,6 +376,7 @@@ static void kvm_vcpu_after_set_cpuid(st
   
         kvm_update_pv_runtime(vcpu);
   
+ +      vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu);
         vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
         vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
   
@@@ -772,7 -771,7 +772,7 @@@ void kvm_set_cpu_caps(void
         kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
   
         kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
- -              0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
+ +              0 /* SME */ | 0 /* SEV */ | 0 /* VM_PAGE_FLUSH */ | 0 /* SEV_ES */ |
                 F(SME_COHERENT));
   
         kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
@@@ -1232,9 -1231,22 +1232,22 @@@ static inline int __do_cpuid_func(struc
                 entry->eax = entry->ebx = entry->ecx = 0;
                 break;
         case 0x80000008: {
-               unsigned g_phys_as = (entry->eax >> 16) & 0xff;
-               unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
-               unsigned phys_as = entry->eax & 0xff;
+               /*
+                * GuestPhysAddrSize (EAX[23:16]) is intended for software
+                * use.
+                *
+                * KVM's ABI is to report the effective MAXPHYADDR for the
+                * guest in PhysAddrSize (phys_as), and the maximum
+                * *addressable* GPA in GuestPhysAddrSize (g_phys_as).
+                *
+                * GuestPhysAddrSize is valid if and only if TDP is enabled,
+                * in which case the max GPA that can be addressed by KVM may
+                * be less than the max GPA that can be legally generated by
+                * the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't
+                * support 5-level TDP.
+                */
+               unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U);
+               unsigned int phys_as, g_phys_as;
   
                 /*
                  * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
@@@ -1242,16 -1254,24 +1255,24 @@@
                  * reductions in MAXPHYADDR for memory encryption affect shadow
                  * paging, too.
                  *
-                * If TDP is enabled but an explicit guest MAXPHYADDR is not
-                * provided, use the raw bare metal MAXPHYADDR as reductions to
-                * the HPAs do not affect GPAs.
+                * If TDP is enabled, use the raw bare metal MAXPHYADDR as
+                * reductions to the HPAs do not affect GPAs.  The max
+                * addressable GPA is the same as the max effective GPA, except
+                * that it's capped at 48 bits if 5-level TDP isn't supported
+                * (hardware processes bits 51:48 only when walking the fifth
+                * level page table).
                  */
-               if (!tdp_enabled)
-                       g_phys_as = boot_cpu_data.x86_phys_bits;
-               else if (!g_phys_as)
+               if (!tdp_enabled) {
+                       phys_as = boot_cpu_data.x86_phys_bits;
+                       g_phys_as = 0;
+               } else {
+                       phys_as = entry->eax & 0xff;
                         g_phys_as = phys_as;
+                       if (kvm_mmu_get_max_tdp_level() < 5)
+                               g_phys_as = min(g_phys_as, 48);
+               }
   
-               entry->eax = g_phys_as | (virt_as << 8);
+               entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16);
                 entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
                 entry->edx = 0;
                 cpuid_entry_override(entry, CPUID_8000_0008_EBX);
diff --combined arch/x86/kvm/mmu.h

index 2343c9f00e318a0d2e9971b146a414c35b997421,b410a227c6018d93e57caafbd1b8313e18e2efba..2e454316f2a2d1b45b9b1929a9e5397a9c3f63a7
--- 1/arch/x86/kvm/mmu.h
--- 2/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@@ -100,6 -100,8 +100,8 @@@ static inline u8 kvm_get_shadow_phys_bi
         return boot_cpu_data.x86_phys_bits;
   }
   
+ u8 kvm_mmu_get_max_tdp_level(void);
+ 
   void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
   void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
   void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
@@@ -213,7 -215,7 +215,7 @@@ static inline u8 permission_fault(struc
          */
         u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
         bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
- -      int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
+ +      int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
         u32 errcode = PFERR_PRESENT_MASK;
         bool fault;
   
@@@ -234,7 -236,8 +236,7 @@@
                 pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
   
                 /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
- -              offset = (pfec & ~1) +
- -                      ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
+ +              offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);
   
                 pkru_bits &= mmu->pkru_mask >> offset;
                 errcode |= -pkru_bits & PFERR_PK_MASK;
diff --combined arch/x86/kvm/mmu/mmu.c

index 99f7b2f3d82a2c9b4192c9bd88af7bce72b42735,db3a26eb7b75beacecdb73eea42c8d2f97d8f4d6..662f62dfb2aa9f0d205ee02a88da0b687c2291d6
--- 1/arch/x86/kvm/mmu/mmu.c
--- 2/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@@ -432,8 -432,8 +432,8 @@@ static u64 __update_clear_spte_slow(u6
    * The idea using the light way get the spte on x86_32 guest is from
    * gup_get_pte (mm/gup.c).
    *
- - * An spte tlb flush may be pending, because kvm_set_pte_rmap
- - * coalesces them and we are running out of the MMU lock.  Therefore
+ + * An spte tlb flush may be pending, because they are coalesced and
+ + * we are running out of the MMU lock.  Therefore
    * we need to protect against in-progress updates of the spte.
    *
    * Reading the spte while an update is in progress may get the old value
@@@ -567,9 -567,9 +567,9 @@@ static u64 mmu_spte_clear_track_bits(st
   
         if (!is_shadow_present_pte(old_spte) ||
             !spte_has_volatile_bits(old_spte))
- -              __update_clear_spte_fast(sptep, 0ull);
+ +              __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
         else
- -              old_spte = __update_clear_spte_slow(sptep, 0ull);
+ +              old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
   
         if (!is_shadow_present_pte(old_spte))
                 return old_spte;
@@@ -603,7 -603,7 +603,7 @@@
    */
   static void mmu_spte_clear_no_track(u64 *sptep)
   {
- -      __update_clear_spte_fast(sptep, 0ull);
+ +      __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
   }
   
   static u64 mmu_spte_get_lockless(u64 *sptep)
@@@ -831,15 -831,6 +831,15 @@@ static void account_shadowed(struct kv
         gfn_t gfn;
   
         kvm->arch.indirect_shadow_pages++;
+ +      /*
+ +       * Ensure indirect_shadow_pages is elevated prior to re-reading guest
+ +       * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight
+ +       * emulated writes are visible before re-reading guest PTEs, or that
+ +       * an emulated write will see the elevated count and acquire mmu_lock
+ +       * to update SPTEs.  Pairs with the smp_mb() in kvm_mmu_track_write().
+ +       */
+ +      smp_mb();
+ +
         gfn = sp->gfn;
         slots = kvm_memslots_for_spte_role(kvm, sp->role);
         slot = __gfn_to_memslot(slots, gfn);
@@@ -1457,11 -1448,49 +1457,11 @@@ static bool __kvm_zap_rmap(struct kvm *
   }
   
   static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- -                       struct kvm_memory_slot *slot, gfn_t gfn, int level,
- -                       pte_t unused)
+ +                       struct kvm_memory_slot *slot, gfn_t gfn, int level)
   {
         return __kvm_zap_rmap(kvm, rmap_head, slot);
   }
   
- -static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- -                           struct kvm_memory_slot *slot, gfn_t gfn, int level,
- -                           pte_t pte)
- -{
- -      u64 *sptep;
- -      struct rmap_iterator iter;
- -      bool need_flush = false;
- -      u64 new_spte;
- -      kvm_pfn_t new_pfn;
- -
- -      WARN_ON_ONCE(pte_huge(pte));
- -      new_pfn = pte_pfn(pte);
- -
- -restart:
- -      for_each_rmap_spte(rmap_head, &iter, sptep) {
- -              need_flush = true;
- -
- -              if (pte_write(pte)) {
- -                      kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
- -                      goto restart;
- -              } else {
- -                      new_spte = kvm_mmu_changed_pte_notifier_make_spte(
- -                                      *sptep, new_pfn);
- -
- -                      mmu_spte_clear_track_bits(kvm, sptep);
- -                      mmu_spte_set(sptep, new_spte);
- -              }
- -      }
- -
- -      if (need_flush && kvm_available_flush_remote_tlbs_range()) {
- -              kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
- -              return false;
- -      }
- -
- -      return need_flush;
- -}
- -
   struct slot_rmap_walk_iterator {
         /* input fields. */
         const struct kvm_memory_slot *slot;
@@@ -1533,7 -1562,7 +1533,7 @@@ static void slot_rmap_walk_next(struct 
   
   typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
                                struct kvm_memory_slot *slot, gfn_t gfn,
- -                             int level, pte_t pte);
+ +                             int level);
   
   static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
                                                  struct kvm_gfn_range *range,
@@@ -1545,7 -1574,7 +1545,7 @@@
         for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
                                  range->start, range->end - 1, &iterator)
                 ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
- -                             iterator.level, range->arg.pte);
+ +                             iterator.level);
   
         return ret;
   }
@@@ -1567,8 -1596,22 +1567,8 @@@ bool kvm_unmap_gfn_range(struct kvm *kv
         return flush;
   }
   
- -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
- -{
- -      bool flush = false;
- -
- -      if (kvm_memslots_have_rmaps(kvm))
- -              flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
- -
- -      if (tdp_mmu_enabled)
- -              flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
- -
- -      return flush;
- -}
- -
   static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- -                       struct kvm_memory_slot *slot, gfn_t gfn, int level,
- -                       pte_t unused)
+ +                       struct kvm_memory_slot *slot, gfn_t gfn, int level)
   {
         u64 *sptep;
         struct rmap_iterator iter;
@@@ -1581,7 -1624,8 +1581,7 @@@
   }
   
   static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- -                            struct kvm_memory_slot *slot, gfn_t gfn,
- -                            int level, pte_t unused)
+ +                            struct kvm_memory_slot *slot, gfn_t gfn, int level)
   {
         u64 *sptep;
         struct rmap_iterator iter;
@@@ -1906,8 -1950,7 +1906,8 @@@ static bool kvm_sync_page_check(struct 
   
   static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
   {
- -      if (!sp->spt[i])
+ +      /* sp->spt[i] has initial value of shadow page table allocation */
+ +      if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
                 return 0;
   
         return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
@@@ -2471,7 -2514,7 +2471,7 @@@ static int mmu_page_zap_pte(struct kvm 
                                 return kvm_mmu_prepare_zap_page(kvm, child,
                                                                 invalid_list);
                 }
- -      } else if (is_mmio_spte(pte)) {
+ +      } else if (is_mmio_spte(kvm, pte)) {
                 mmu_spte_clear_no_track(spte);
         }
         return 0;
@@@ -3271,19 -3314,9 +3271,19 @@@ static int kvm_handle_noslot_fault(stru
   {
         gva_t gva = fault->is_tdp ? 0 : fault->addr;
   
+ +      if (fault->is_private) {
+ +              kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ +              return -EFAULT;
+ +      }
+ +
         vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
                              access & shadow_mmio_access_mask);
   
+ +      fault->slot = NULL;
+ +      fault->pfn = KVM_PFN_NOSLOT;
+ +      fault->map_writable = false;
+ +      fault->hva = KVM_HVA_ERR_BAD;
+ +
         /*
          * If MMIO caching is disabled, emulate immediately without
          * touching the shadow page tables as attempting to install an
@@@ -4163,7 -4196,7 +4163,7 @@@ static int handle_mmio_page_fault(struc
         if (WARN_ON_ONCE(reserved))
                 return -EINVAL;
   
- -      if (is_mmio_spte(spte)) {
+ +      if (is_mmio_spte(vcpu->kvm, spte)) {
                 gfn_t gfn = get_mmio_spte_gfn(spte);
                 unsigned int access = get_mmio_spte_access(spte);
   
@@@ -4226,28 -4259,24 +4226,28 @@@ static u32 alloc_apf_token(struct kvm_v
         return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
   }
   
- -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- -                                  gfn_t gfn)
+ +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
+ +                                  struct kvm_page_fault *fault)
   {
         struct kvm_arch_async_pf arch;
   
         arch.token = alloc_apf_token(vcpu);
- -      arch.gfn = gfn;
+ +      arch.gfn = fault->gfn;
+ +      arch.error_code = fault->error_code;
         arch.direct_map = vcpu->arch.mmu->root_role.direct;
         arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
   
- -      return kvm_setup_async_pf(vcpu, cr2_or_gpa,
- -                                kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+ +      return kvm_setup_async_pf(vcpu, fault->addr,
+ +                                kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
   }
   
   void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
   {
         int r;
   
+ +      if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
+ +              return;
+ +
         if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
               work->wakeup_all)
                 return;
@@@ -4260,7 -4289,7 +4260,7 @@@
               work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
                 return;
   
- -      kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
+ +      kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
   }
   
   static inline u8 kvm_max_level_for_order(int order)
@@@ -4280,6 -4309,14 +4280,6 @@@
         return PG_LEVEL_4K;
   }
   
- -static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
- -                                            struct kvm_page_fault *fault)
- -{
- -      kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
- -                                    PAGE_SIZE, fault->write, fault->exec,
- -                                    fault->is_private);
- -}
- -
   static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
                                    struct kvm_page_fault *fault)
   {
@@@ -4306,15 -4343,48 +4306,15 @@@
   
   static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
   {
- -      struct kvm_memory_slot *slot = fault->slot;
         bool async;
   
- -      /*
- -       * Retry the page fault if the gfn hit a memslot that is being deleted
- -       * or moved.  This ensures any existing SPTEs for the old memslot will
- -       * be zapped before KVM inserts a new MMIO SPTE for the gfn.
- -       */
- -      if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
- -              return RET_PF_RETRY;
- -
- -      if (!kvm_is_visible_memslot(slot)) {
- -              /* Don't expose private memslots to L2. */
- -              if (is_guest_mode(vcpu)) {
- -                      fault->slot = NULL;
- -                      fault->pfn = KVM_PFN_NOSLOT;
- -                      fault->map_writable = false;
- -                      return RET_PF_CONTINUE;
- -              }
- -              /*
- -               * If the APIC access page exists but is disabled, go directly
- -               * to emulation without caching the MMIO access or creating a
- -               * MMIO SPTE.  That way the cache doesn't need to be purged
- -               * when the AVIC is re-enabled.
- -               */
- -              if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
- -                  !kvm_apicv_activated(vcpu->kvm))
- -                      return RET_PF_EMULATE;
- -      }
- -
- -      if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
- -              kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
- -              return -EFAULT;
- -      }
- -
         if (fault->is_private)
                 return kvm_faultin_pfn_private(vcpu, fault);
   
         async = false;
- -      fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
- -                                        fault->write, &fault->map_writable,
- -                                        &fault->hva);
+ +      fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false,
+ +                                        &async, fault->write,
+ +                                        &fault->map_writable, &fault->hva);
         if (!async)
                 return RET_PF_CONTINUE; /* *pfn has correct page already */
   
@@@ -4324,7 -4394,7 +4324,7 @@@
                         trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
                         return RET_PF_RETRY;
- -              } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
+ +              } else if (kvm_arch_setup_async_pf(vcpu, fault)) {
                         return RET_PF_RETRY;
                 }
         }
@@@ -4334,72 -4404,17 +4334,72 @@@
          * to wait for IO.  Note, gup always bails if it is unable to quickly
          * get a page and a fatal signal, i.e. SIGKILL, is pending.
          */
- -      fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
- -                                        fault->write, &fault->map_writable,
- -                                        &fault->hva);
+ +      fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
+ +                                        NULL, fault->write,
+ +                                        &fault->map_writable, &fault->hva);
         return RET_PF_CONTINUE;
   }
   
   static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
                            unsigned int access)
   {
+ +      struct kvm_memory_slot *slot = fault->slot;
         int ret;
   
+ +      /*
+ +       * Note that the mmu_invalidate_seq also serves to detect a concurrent
+ +       * change in attributes.  is_page_fault_stale() will detect an
+ +       * invalidation related to fault->gfn and resume the guest without
+ +       * installing a mapping in the page tables.
+ +       */
+ +      fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ +      smp_rmb();
+ +
+ +      /*
+ +       * Now that we have a snapshot of mmu_invalidate_seq we can check for a
+ +       * private vs. shared mismatch.
+ +       */
+ +      if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+ +              kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ +              return -EFAULT;
+ +      }
+ +
+ +      if (unlikely(!slot))
+ +              return kvm_handle_noslot_fault(vcpu, fault, access);
+ +
+ +      /*
+ +       * Retry the page fault if the gfn hit a memslot that is being deleted
+ +       * or moved.  This ensures any existing SPTEs for the old memslot will
+ +       * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ +       */
+ +      if (slot->flags & KVM_MEMSLOT_INVALID)
+ +              return RET_PF_RETRY;
+ +
+ +      if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
+ +              /*
+ +               * Don't map L1's APIC access page into L2, KVM doesn't support
+ +               * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
+ +               * i.e. the access needs to be emulated.  Emulating access to
+ +               * L1's APIC is also correct if L1 is accelerating L2's own
+ +               * virtual APIC, but for some reason L1 also maps _L1's_ APIC
+ +               * into L2.  Note, vcpu_is_mmio_gpa() always treats access to
+ +               * the APIC as MMIO.  Allow an MMIO SPTE to be created, as KVM
+ +               * uses different roots for L1 vs. L2, i.e. there is no danger
+ +               * of breaking APICv/AVIC for L1.
+ +               */
+ +              if (is_guest_mode(vcpu))
+ +                      return kvm_handle_noslot_fault(vcpu, fault, access);
+ +
+ +              /*
+ +               * If the APIC access page exists but is disabled, go directly
+ +               * to emulation without caching the MMIO access or creating a
+ +               * MMIO SPTE.  That way the cache doesn't need to be purged
+ +               * when the AVIC is re-enabled.
+ +               */
+ +              if (!kvm_apicv_activated(vcpu->kvm))
+ +                      return RET_PF_EMULATE;
+ +      }
+ +
         fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
         smp_rmb();
   
@@@ -4424,7 -4439,8 +4424,7 @@@
          * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
          * to detect retry guarantees the worst case latency for the vCPU.
          */
- -      if (fault->slot &&
- -          mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+ +      if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
                 return RET_PF_RETRY;
   
         ret = __kvm_faultin_pfn(vcpu, fault);
@@@ -4434,7 -4450,7 +4434,7 @@@
         if (unlikely(is_error_pfn(fault->pfn)))
                 return kvm_handle_error_pfn(vcpu, fault);
   
- -      if (unlikely(!fault->slot))
+ +      if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
                 return kvm_handle_noslot_fault(vcpu, fault, access);
   
         /*
@@@ -4545,16 -4561,6 +4545,16 @@@ int kvm_handle_page_fault(struct kvm_vc
         if (WARN_ON_ONCE(fault_address >> 32))
                 return -EFAULT;
   #endif
+ +      /*
+ +       * Legacy #PF exceptions only have a 32-bit error code.  Simply drop the
+ +       * upper bits as KVM doesn't use them for #PF (because they are never
+ +       * set), and to ensure there are no collisions with KVM-defined bits.
+ +       */
+ +      if (WARN_ON_ONCE(error_code >> 32))
+ +              error_code = lower_32_bits(error_code);
+ +
+ +      /* Ensure the above sanity check also covers KVM-defined flags. */
+ +      BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
   
         vcpu->arch.l1tf_flush_l1d = true;
         if (!flags) {
@@@ -4806,7 -4812,7 +4806,7 @@@ EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd)
   static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
                            unsigned int access)
   {
- -      if (unlikely(is_mmio_spte(*sptep))) {
+ +      if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
                 if (gfn != get_mmio_spte_gfn(*sptep)) {
                         mmu_spte_clear_no_track(sptep);
                         return true;
@@@ -4929,7 -4935,7 +4929,7 @@@ static void reset_guest_rsvds_bits_mask
                                 context->cpu_role.base.level, is_efer_nx(context),
                                 guest_can_use(vcpu, X86_FEATURE_GBPAGES),
                                 is_cr4_pse(context),
- -                              guest_cpuid_is_amd_or_hygon(vcpu));
+ +                              guest_cpuid_is_amd_compatible(vcpu));
   }
   
   static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
@@@ -5316,6 -5322,11 +5316,11 @@@ static inline int kvm_mmu_get_tdp_level
         return max_tdp_level;
   }
   
+ u8 kvm_mmu_get_max_tdp_level(void)
+ {
+       return tdp_root_level ? tdp_root_level : max_tdp_level;
+ }
+ 
   static union kvm_mmu_page_role
   kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
                                 union kvm_cpu_role cpu_role)
@@@ -5570,9 -5581,9 +5575,9 @@@ void kvm_mmu_after_set_cpuid(struct kvm
          * that problem is swept under the rug; KVM's CPUID API is horrific and
          * it's all but impossible to solve it without introducing a new API.
          */
- -      vcpu->arch.root_mmu.root_role.word = 0;
- -      vcpu->arch.guest_mmu.root_role.word = 0;
- -      vcpu->arch.nested_mmu.root_role.word = 0;
+ +      vcpu->arch.root_mmu.root_role.invalid = 1;
+ +      vcpu->arch.guest_mmu.root_role.invalid = 1;
+ +      vcpu->arch.nested_mmu.root_role.invalid = 1;
         vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
         vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
         vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
@@@ -5796,15 -5807,10 +5801,15 @@@ void kvm_mmu_track_write(struct kvm_vcp
         bool flush = false;
   
         /*
- -       * If we don't have indirect shadow pages, it means no page is
- -       * write-protected, so we can exit simply.
+ +       * When emulating guest writes, ensure the written value is visible to
+ +       * any task that is handling page faults before checking whether or not
+ +       * KVM is shadowing a guest PTE.  This ensures either KVM will create
+ +       * the correct SPTE in the page fault handler, or this task will see
+ +       * a non-zero indirect_shadow_pages.  Pairs with the smp_mb() in
+ +       * account_shadowed().
          */
- -      if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+ +      smp_mb();
+ +      if (!vcpu->kvm->arch.indirect_shadow_pages)
                 return;
   
         write_lock(&vcpu->kvm->mmu_lock);
@@@ -5845,35 -5851,30 +5850,35 @@@ int noinline kvm_mmu_page_fault(struct 
         int r, emulation_type = EMULTYPE_PF;
         bool direct = vcpu->arch.mmu->root_role.direct;
   
- -      /*
- -       * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
- -       * checks when emulating instructions that triggers implicit access.
- -       * WARN if hardware generates a fault with an error code that collides
- -       * with the KVM-defined value.  Clear the flag and continue on, i.e.
- -       * don't terminate the VM, as KVM can't possibly be relying on a flag
- -       * that KVM doesn't know about.
- -       */
- -      if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
- -              error_code &= ~PFERR_IMPLICIT_ACCESS;
- -
         if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
                 return RET_PF_RETRY;
   
+ +      /*
+ +       * Except for reserved faults (emulated MMIO is shared-only), set the
+ +       * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
+ +       * current attributes, which are the source of truth for such VMs.  Note,
+ +       * this is wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
+ +       * currently support nested virtualization (among many other things)
+ +       * for software-protected VMs.
+ +       */
+ +      if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
+ +          !(error_code & PFERR_RSVD_MASK) &&
+ +          vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
+ +          kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
+ +              error_code |= PFERR_PRIVATE_ACCESS;
+ +
         r = RET_PF_INVALID;
         if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ +              if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
+ +                      return -EFAULT;
+ +
                 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
                 if (r == RET_PF_EMULATE)
                         goto emulate;
         }
   
         if (r == RET_PF_INVALID) {
- -              r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
- -                                        lower_32_bits(error_code), false,
+ +              r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
                                           &emulation_type);
                 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
                         return -EIO;
@@@ -6177,10 -6178,7 +6182,10 @@@ int kvm_mmu_create(struct kvm_vcpu *vcp
         vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
         vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
   
- -      vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+ +      vcpu->arch.mmu_shadow_page_cache.init_value =
+ +              SHADOW_NONPRESENT_VALUE;
+ +      if (!vcpu->arch.mmu_shadow_page_cache.init_value)
+ +              vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
   
         vcpu->arch.mmu = &vcpu->arch.root_mmu;
         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@@ -6323,7 -6321,6 +6328,7 @@@ static bool kvm_has_zapped_obsolete_pag
   
   void kvm_mmu_init_vm(struct kvm *kvm)
   {
+ +      kvm->arch.shadow_mmio_value = shadow_mmio_value;
         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
         INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
         INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
@@@ -7407,8 -7404,7 +7412,8 @@@ bool kvm_arch_post_set_memory_attribute
                          * by the memslot, KVM can't use a hugepage due to the
                          * misaligned address regardless of memory attributes.
                          */
- -                      if (gfn >= slot->base_gfn) {
+ +                      if (gfn >= slot->base_gfn &&
+ +                          gfn + nr_pages <= slot->base_gfn + slot->npages) {
                                 if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
                                         hugepage_clear_mixed(slot, gfn, level);
                                 else
diff --combined arch/x86/kvm/x86.c

index fda22b3800a10d67bcc64a0455ce7dcb6bfd3791,95a86ee871ff31dcf629241759b4689e729af813..082ac6d95a3a08160d54ec832d0c075c0bd71489
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -92,12 -92,9 +92,12 @@@
   #define MAX_IO_MSRS 256
   #define KVM_MAX_MCE_BANKS 32
   
- -struct kvm_caps kvm_caps __read_mostly = {
- -      .supported_mce_cap = MCG_CTL_P | MCG_SER_P,
- -};
+ +/*
+ + * Note, kvm_caps fields should *never* have default values, all fields must be
+ + * recomputed from scratch during vendor module load, e.g. to account for a
+ + * vendor module being reloaded with different module parameters.
+ + */
+ +struct kvm_caps kvm_caps __read_mostly;
   EXPORT_SYMBOL_GPL(kvm_caps);
   
   #define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
@@@ -1624,7 -1621,7 +1624,7 @@@ static bool kvm_is_immutable_feature_ms
          ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
          ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
          ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
- -       ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR)
+ +       ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO)
   
   static u64 kvm_get_arch_capabilities(void)
   {
@@@ -2233,16 -2230,13 +2233,13 @@@ static int do_set_msr(struct kvm_vcpu *
         /*
          * Disallow writes to immutable feature MSRs after KVM_RUN.  KVM does
          * not support modifying the guest vCPU model on the fly, e.g. changing
-        * the nVMX capabilities while L2 is running is nonsensical.  Ignore
+        * the nVMX capabilities while L2 is running is nonsensical.  Allow
          * writes of the same value, e.g. to allow userspace to blindly stuff
          * all MSRs when emulating RESET.
          */
-       if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
-               if (do_get_msr(vcpu, index, &val) || *data != val)
-                       return -EINVAL;
- 
-               return 0;
-       }
+       if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) &&
+           (do_get_msr(vcpu, index, &val) || *data != val))
+               return -EINVAL;
   
         return kvm_set_msr_ignored_check(vcpu, index, *data, true);
   }
@@@ -3473,7 -3467,7 +3470,7 @@@ static bool is_mci_status_msr(u32 msr
   static bool can_set_mci_status(struct kvm_vcpu *vcpu)
   {
         /* McStatusWrEn enabled? */
- -      if (guest_cpuid_is_amd_or_hygon(vcpu))
+ +      if (guest_cpuid_is_amd_compatible(vcpu))
                 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
   
         return false;
@@@ -4632,7 -4626,9 +4629,7 @@@ static int kvm_ioctl_get_supported_hv_c
   
   static bool kvm_is_vm_type_supported(unsigned long type)
   {
- -      return type == KVM_X86_DEFAULT_VM ||
- -             (type == KVM_X86_SW_PROTECTED_VM &&
- -              IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
+ +      return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
   }
   
   int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@@ -4833,7 -4829,9 +4830,7 @@@
                 r = kvm_caps.has_notify_vmexit;
                 break;
         case KVM_CAP_VM_TYPES:
- -              r = BIT(KVM_X86_DEFAULT_VM);
- -              if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM))
- -                      r |= BIT(KVM_X86_SW_PROTECTED_VM);
+ +              r = kvm_caps.supported_vm_types;
                 break;
         default:
                 break;
@@@ -4841,44 -4839,46 +4838,44 @@@
         return r;
   }
   
- -static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
- -{
- -      void __user *uaddr = (void __user*)(unsigned long)attr->addr;
- -
- -      if ((u64)(unsigned long)uaddr != attr->addr)
- -              return ERR_PTR_USR(-EFAULT);
- -      return uaddr;
- -}
- -
- -static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+ +static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val)
   {
- -      u64 __user *uaddr = kvm_get_attr_addr(attr);
- -
- -      if (attr->group)
+ +      if (attr->group) {
+ +              if (kvm_x86_ops.dev_get_attr)
+ +                      return static_call(kvm_x86_dev_get_attr)(attr->group, attr->attr, val);
                 return -ENXIO;
- -
- -      if (IS_ERR(uaddr))
- -              return PTR_ERR(uaddr);
+ +      }
   
         switch (attr->attr) {
         case KVM_X86_XCOMP_GUEST_SUPP:
- -              if (put_user(kvm_caps.supported_xcr0, uaddr))
- -                      return -EFAULT;
+ +              *val = kvm_caps.supported_xcr0;
                 return 0;
         default:
                 return -ENXIO;
         }
   }
   
+ +static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+ +{
+ +      u64 __user *uaddr = u64_to_user_ptr(attr->addr);
+ +      int r;
+ +      u64 val;
+ +
+ +      r = __kvm_x86_dev_get_attr(attr, &val);
+ +      if (r < 0)
+ +              return r;
+ +
+ +      if (put_user(val, uaddr))
+ +              return -EFAULT;
+ +
+ +      return 0;
+ +}
+ +
   static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
   {
- -      if (attr->group)
- -              return -ENXIO;
+ +      u64 val;
   
- -      switch (attr->attr) {
- -      case KVM_X86_XCOMP_GUEST_SUPP:
- -              return 0;
- -      default:
- -              return -ENXIO;
- -      }
+ +      return __kvm_x86_dev_get_attr(attr, &val);
   }
   
   long kvm_arch_dev_ioctl(struct file *filp,
@@@ -5554,15 -5554,11 +5551,15 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
         return 0;
   }
   
- -static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
- -                                           struct kvm_debugregs *dbgregs)
+ +static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ +                                          struct kvm_debugregs *dbgregs)
   {
         unsigned int i;
   
+ +      if (vcpu->kvm->arch.has_protected_state &&
+ +          vcpu->arch.guest_state_protected)
+ +              return -EINVAL;
+ +
         memset(dbgregs, 0, sizeof(*dbgregs));
   
         BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
@@@ -5571,7 -5567,6 +5568,7 @@@
   
         dbgregs->dr6 = vcpu->arch.dr6;
         dbgregs->dr7 = vcpu->arch.dr7;
+ +      return 0;
   }
   
   static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@@ -5579,10 -5574,6 +5576,10 @@@
   {
         unsigned int i;
   
+ +      if (vcpu->kvm->arch.has_protected_state &&
+ +          vcpu->arch.guest_state_protected)
+ +              return -EINVAL;
+ +
         if (dbgregs->flags)
                 return -EINVAL;
   
@@@ -5603,8 -5594,8 +5600,8 @@@
   }
   
   
- -static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
- -                                        u8 *state, unsigned int size)
+ +static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+ +                                       u8 *state, unsigned int size)
   {
         /*
          * Only copy state for features that are enabled for the guest.  The
@@@ -5622,25 -5613,24 +5619,25 @@@
                              XFEATURE_MASK_FPSSE;
   
         if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- -              return;
+ +              return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
   
         fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
                                        supported_xcr0, vcpu->arch.pkru);
+ +      return 0;
   }
   
- -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
- -                                       struct kvm_xsave *guest_xsave)
+ +static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ +                                      struct kvm_xsave *guest_xsave)
   {
- -      kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
- -                                    sizeof(guest_xsave->region));
+ +      return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ +                                           sizeof(guest_xsave->region));
   }
   
   static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                                         struct kvm_xsave *guest_xsave)
   {
         if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- -              return 0;
+ +              return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
   
         return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
                                               guest_xsave->region,
@@@ -5648,23 -5638,18 +5645,23 @@@
                                               &vcpu->arch.pkru);
   }
   
- -static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
- -                                      struct kvm_xcrs *guest_xcrs)
+ +static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ +                                     struct kvm_xcrs *guest_xcrs)
   {
+ +      if (vcpu->kvm->arch.has_protected_state &&
+ +          vcpu->arch.guest_state_protected)
+ +              return -EINVAL;
+ +
         if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
                 guest_xcrs->nr_xcrs = 0;
- -              return;
+ +              return 0;
         }
   
         guest_xcrs->nr_xcrs = 1;
         guest_xcrs->flags = 0;
         guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
         guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+ +      return 0;
   }
   
   static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
@@@ -5672,10 -5657,6 +5669,10 @@@
   {
         int i, r = 0;
   
+ +      if (vcpu->kvm->arch.has_protected_state &&
+ +          vcpu->arch.guest_state_protected)
+ +              return -EINVAL;
+ +
         if (!boot_cpu_has(X86_FEATURE_XSAVE))
                 return -EINVAL;
   
@@@ -5728,9 -5709,12 +5725,9 @@@ static int kvm_arch_tsc_has_attr(struc
   static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
                                  struct kvm_device_attr *attr)
   {
- -      u64 __user *uaddr = kvm_get_attr_addr(attr);
+ +      u64 __user *uaddr = u64_to_user_ptr(attr->addr);
         int r;
   
- -      if (IS_ERR(uaddr))
- -              return PTR_ERR(uaddr);
- -
         switch (attr->attr) {
         case KVM_VCPU_TSC_OFFSET:
                 r = -EFAULT;
@@@ -5748,10 -5732,13 +5745,10 @@@
   static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
                                  struct kvm_device_attr *attr)
   {
- -      u64 __user *uaddr = kvm_get_attr_addr(attr);
+ +      u64 __user *uaddr = u64_to_user_ptr(attr->addr);
         struct kvm *kvm = vcpu->kvm;
         int r;
   
- -      if (IS_ERR(uaddr))
- -              return PTR_ERR(uaddr);
- -
         switch (attr->attr) {
         case KVM_VCPU_TSC_OFFSET: {
                 u64 offset, tsc, ns;
@@@ -6058,9 -6045,7 +6055,9 @@@ long kvm_arch_vcpu_ioctl(struct file *f
         case KVM_GET_DEBUGREGS: {
                 struct kvm_debugregs dbgregs;
   
- -              kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ +              r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ +              if (r < 0)
+ +                      break;
   
                 r = -EFAULT;
                 if (copy_to_user(argp, &dbgregs,
@@@ -6090,9 -6075,7 +6087,9 @@@
                 if (!u.xsave)
                         break;
   
- -              kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ +              r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ +              if (r < 0)
+ +                      break;
   
                 r = -EFAULT;
                 if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
@@@ -6121,9 -6104,7 +6118,9 @@@
                 if (!u.xsave)
                         break;
   
- -              kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ +              r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ +              if (r < 0)
+ +                      break;
   
                 r = -EFAULT;
                 if (copy_to_user(argp, u.xsave, size))
@@@ -6139,9 -6120,7 +6136,9 @@@
                 if (!u.xcrs)
                         break;
   
- -              kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ +              r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ +              if (r < 0)
+ +                      break;
   
                 r = -EFAULT;
                 if (copy_to_user(argp, u.xcrs,
@@@ -6285,11 -6264,6 +6282,11 @@@
         }
   #endif
         case KVM_GET_SREGS2: {
+ +              r = -EINVAL;
+ +              if (vcpu->kvm->arch.has_protected_state &&
+ +                  vcpu->arch.guest_state_protected)
+ +                      goto out;
+ +
                 u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
                 r = -ENOMEM;
                 if (!u.sregs2)
@@@ -6302,11 -6276,6 +6299,11 @@@
                 break;
         }
         case KVM_SET_SREGS2: {
+ +              r = -EINVAL;
+ +              if (vcpu->kvm->arch.has_protected_state &&
+ +                  vcpu->arch.guest_state_protected)
+ +                      goto out;
+ +
                 u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
                 if (IS_ERR(u.sregs2)) {
                         r = PTR_ERR(u.sregs2);
@@@ -9760,8 -9729,6 +9757,8 @@@ int kvm_x86_vendor_init(struct kvm_x86_
                 return -EIO;
         }
   
+ +      memset(&kvm_caps, 0, sizeof(kvm_caps));
+ +
         x86_emulator_cache = kvm_alloc_emulator_cache();
         if (!x86_emulator_cache) {
                 pr_err("failed to allocate cache for x86 emulator\n");
@@@ -9780,9 -9747,6 +9777,9 @@@
         if (r)
                 goto out_free_percpu;
   
+ +      kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
+ +      kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
+ +
         if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
                 kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
@@@ -9828,9 -9792,6 +9825,9 @@@
   
         kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
   
+ +      if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled)
+ +              kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM);
+ +
         if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
                 kvm_caps.supported_xss = 0;
   
@@@ -10031,15 -9992,12 +10028,12 @@@ static void set_or_clear_apicv_inhibit(
   
   static void kvm_apicv_init(struct kvm *kvm)
   {
-       unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons;
+       enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT :
+                                                      APICV_INHIBIT_REASON_DISABLE;
   
-       init_rwsem(&kvm->arch.apicv_update_lock);
- 
-       set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);
+       set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
   
-       if (!enable_apicv)
-               set_or_clear_apicv_inhibit(inhibits,
-                                          APICV_INHIBIT_REASON_DISABLE, true);
+       init_rwsem(&kvm->arch.apicv_update_lock);
   }
   
   static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
@@@ -10087,15 -10045,26 +10081,15 @@@ static int complete_hypercall_exit(stru
         return kvm_skip_emulated_instruction(vcpu);
   }
   
- -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+ +unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ +                                    unsigned long a0, unsigned long a1,
+ +                                    unsigned long a2, unsigned long a3,
+ +                                    int op_64_bit, int cpl)
   {
- -      unsigned long nr, a0, a1, a2, a3, ret;
- -      int op_64_bit;
- -
- -      if (kvm_xen_hypercall_enabled(vcpu->kvm))
- -              return kvm_xen_hypercall(vcpu);
- -
- -      if (kvm_hv_hypercall_enabled(vcpu))
- -              return kvm_hv_hypercall(vcpu);
- -
- -      nr = kvm_rax_read(vcpu);
- -      a0 = kvm_rbx_read(vcpu);
- -      a1 = kvm_rcx_read(vcpu);
- -      a2 = kvm_rdx_read(vcpu);
- -      a3 = kvm_rsi_read(vcpu);
+ +      unsigned long ret;
   
         trace_kvm_hypercall(nr, a0, a1, a2, a3);
   
- -      op_64_bit = is_64_bit_hypercall(vcpu);
         if (!op_64_bit) {
                 nr &= 0xFFFFFFFF;
                 a0 &= 0xFFFFFFFF;
@@@ -10104,7 -10073,7 +10098,7 @@@
                 a3 &= 0xFFFFFFFF;
         }
   
- -      if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
+ +      if (cpl) {
                 ret = -KVM_EPERM;
                 goto out;
         }
@@@ -10165,49 -10134,18 +10159,49 @@@
   
                 WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
                 vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+ +              /* stat is incremented on completion. */
                 return 0;
         }
         default:
                 ret = -KVM_ENOSYS;
                 break;
         }
+ +
   out:
+ +      ++vcpu->stat.hypercalls;
+ +      return ret;
+ +}
+ +EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
+ +
+ +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+ +{
+ +      unsigned long nr, a0, a1, a2, a3, ret;
+ +      int op_64_bit;
+ +      int cpl;
+ +
+ +      if (kvm_xen_hypercall_enabled(vcpu->kvm))
+ +              return kvm_xen_hypercall(vcpu);
+ +
+ +      if (kvm_hv_hypercall_enabled(vcpu))
+ +              return kvm_hv_hypercall(vcpu);
+ +
+ +      nr = kvm_rax_read(vcpu);
+ +      a0 = kvm_rbx_read(vcpu);
+ +      a1 = kvm_rcx_read(vcpu);
+ +      a2 = kvm_rdx_read(vcpu);
+ +      a3 = kvm_rsi_read(vcpu);
+ +      op_64_bit = is_64_bit_hypercall(vcpu);
+ +      cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ +
+ +      ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
+ +      if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
+ +              /* MAP_GPA tosses the request to the user space. */
+ +              return 0;
+ +
         if (!op_64_bit)
                 ret = (u32)ret;
         kvm_rax_write(vcpu, ret);
   
- -      ++vcpu->stat.hypercalls;
         return kvm_skip_emulated_instruction(vcpu);
   }
   EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@@ -11542,10 -11480,6 +11536,10 @@@ static void __get_regs(struct kvm_vcpu 
   
   int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
   {
+ +      if (vcpu->kvm->arch.has_protected_state &&
+ +          vcpu->arch.guest_state_protected)
+ +              return -EINVAL;
+ +
         vcpu_load(vcpu);
         __get_regs(vcpu, regs);
         vcpu_put(vcpu);
@@@ -11587,10 -11521,6 +11581,10 @@@ static void __set_regs(struct kvm_vcpu 
   
   int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
   {
+ +      if (vcpu->kvm->arch.has_protected_state &&
+ +          vcpu->arch.guest_state_protected)
+ +              return -EINVAL;
+ +
         vcpu_load(vcpu);
         __set_regs(vcpu, regs);
         vcpu_put(vcpu);
@@@ -11663,10 -11593,6 +11657,10 @@@ static void __get_sregs2(struct kvm_vcp
   int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                   struct kvm_sregs *sregs)
   {
+ +      if (vcpu->kvm->arch.has_protected_state &&
+ +          vcpu->arch.guest_state_protected)
+ +              return -EINVAL;
+ +
         vcpu_load(vcpu);
         __get_sregs(vcpu, sregs);
         vcpu_put(vcpu);
@@@ -11934,10 -11860,6 +11928,10 @@@ int kvm_arch_vcpu_ioctl_set_sregs(struc
   {
         int ret;
   
+ +      if (vcpu->kvm->arch.has_protected_state &&
+ +          vcpu->arch.guest_state_protected)
+ +              return -EINVAL;
+ +
         vcpu_load(vcpu);
         ret = __set_sregs(vcpu, sregs);
         vcpu_put(vcpu);
@@@ -12055,7 -11977,7 +12049,7 @@@ int kvm_arch_vcpu_ioctl_get_fpu(struct 
         struct fxregs_state *fxsave;
   
         if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- -              return 0;
+ +              return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
   
         vcpu_load(vcpu);
   
@@@ -12078,7 -12000,7 +12072,7 @@@ int kvm_arch_vcpu_ioctl_set_fpu(struct 
         struct fxregs_state *fxsave;
   
         if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- -              return 0;
+ +              return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
   
         vcpu_load(vcpu);
   
@@@ -12604,8 -12526,6 +12598,8 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
                 return -EINVAL;
   
         kvm->arch.vm_type = type;
+ +      kvm->arch.has_private_mem =
+ +              (type == KVM_X86_SW_PROTECTED_VM);
   
         ret = kvm_page_track_init(kvm);
         if (ret)
@@@ -12805,7 -12725,7 +12799,7 @@@ static void memslot_rmap_free(struct kv
         int i;
   
         for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-               kvfree(slot->arch.rmap[i]);
+               vfree(slot->arch.rmap[i]);
                 slot->arch.rmap[i] = NULL;
         }
   }
@@@ -12817,7 -12737,7 +12811,7 @@@ void kvm_arch_free_memslot(struct kvm *
         memslot_rmap_free(slot);
   
         for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
-               kvfree(slot->arch.lpage_info[i - 1]);
+               vfree(slot->arch.lpage_info[i - 1]);
                 slot->arch.lpage_info[i - 1] = NULL;
         }
   
@@@ -12909,7 -12829,7 +12903,7 @@@ out_free
         memslot_rmap_free(slot);
   
         for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
-               kvfree(slot->arch.lpage_info[i - 1]);
+               vfree(slot->arch.lpage_info[i - 1]);
                 slot->arch.lpage_info[i - 1] = NULL;
         }
         return -ENOMEM;
diff --combined virt/kvm/kvm_main.c

index fb86ec20ebc4c418159f7c8a01dd10f2ab5d1ca3,711970d385f5b2bb5b5fff80b5b1b631ef269100..a1756d5077ee24eb25c3cfbf4f6708bce2850277
--- 1/virt/kvm/kvm_main.c
--- 2/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@@ -311,7 -311,8 +311,7 @@@ bool kvm_make_vcpus_request_mask(struc
         return called;
   }
   
- -bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
- -                                    struct kvm_vcpu *except)
+ +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
   {
         struct kvm_vcpu *vcpu;
         struct cpumask *cpus;
@@@ -324,14 -325,22 +324,14 @@@
         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
         cpumask_clear(cpus);
   
- -      kvm_for_each_vcpu(i, vcpu, kvm) {
- -              if (vcpu == except)
- -                      continue;
+ +      kvm_for_each_vcpu(i, vcpu, kvm)
                 kvm_make_vcpu_request(vcpu, req, cpus, me);
- -      }
   
         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
         put_cpu();
   
         return called;
   }
- -
- -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
- -{
- -      return kvm_make_all_cpus_request_except(kvm, req, NULL);
- -}
   EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
   
   void kvm_flush_remote_tlbs(struct kvm *kvm)
@@@ -392,17 -401,12 +392,17 @@@ static void kvm_flush_shadow_all(struc
   static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
                                                gfp_t gfp_flags)
   {
+ +      void *page;
+ +
         gfp_flags |= mc->gfp_zero;
   
         if (mc->kmem_cache)
                 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
- -      else
- -              return (void *)__get_free_page(gfp_flags);
+ +
+ +      page = (void *)__get_free_page(gfp_flags);
+ +      if (page && mc->init_value)
+ +              memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
+ +      return page;
   }
   
   int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
@@@ -417,13 -421,6 +417,13 @@@
                 if (WARN_ON_ONCE(!capacity))
                         return -EIO;
   
+ +              /*
+ +               * Custom init values can be used only for page allocations,
+ +               * and obviously conflict with __GFP_ZERO.
+ +               */
+ +              if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
+ +                      return -EIO;
+ +
                 mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
                 if (!mc->objects)
                         return -ENOMEM;
@@@ -586,6 -583,8 +586,6 @@@ static void kvm_null_fn(void
   }
   #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
   
- -static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
- -
   /* Iterate over each memslot intersecting [start, last] (inclusive) range */
   #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)        \
         for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
@@@ -671,12 -670,14 +671,12 @@@ static __always_inline kvm_mn_ret_t __k
   static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
                                                 unsigned long start,
                                                 unsigned long end,
- -                                              union kvm_mmu_notifier_arg arg,
                                                 gfn_handler_t handler)
   {
         struct kvm *kvm = mmu_notifier_to_kvm(mn);
         const struct kvm_mmu_notifier_range range = {
                 .start          = start,
                 .end            = end,
- -              .arg            = arg,
                 .handler        = handler,
                 .on_lock        = (void *)kvm_null_fn,
                 .flush_on_ret   = true,
@@@ -704,6 -705,48 +704,6 @@@ static __always_inline int kvm_handle_h
         return __kvm_handle_hva_range(kvm, &range).ret;
   }
   
- -static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
- -{
- -      /*
- -       * Skipping invalid memslots is correct if and only change_pte() is
- -       * surrounded by invalidate_range_{start,end}(), which is currently
- -       * guaranteed by the primary MMU.  If that ever changes, KVM needs to
- -       * unmap the memslot instead of skipping the memslot to ensure that KVM
- -       * doesn't hold references to the old PFN.
- -       */
- -      WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
- -
- -      if (range->slot->flags & KVM_MEMSLOT_INVALID)
- -              return false;
- -
- -      return kvm_set_spte_gfn(kvm, range);
- -}
- -
- -static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
- -                                      struct mm_struct *mm,
- -                                      unsigned long address,
- -                                      pte_t pte)
- -{
- -      struct kvm *kvm = mmu_notifier_to_kvm(mn);
- -      const union kvm_mmu_notifier_arg arg = { .pte = pte };
- -
- -      trace_kvm_set_spte_hva(address);
- -
- -      /*
- -       * .change_pte() must be surrounded by .invalidate_range_{start,end}().
- -       * If mmu_invalidate_in_progress is zero, then no in-progress
- -       * invalidations, including this one, found a relevant memslot at
- -       * start(); rechecking memslots here is unnecessary.  Note, a false
- -       * positive (count elevated by a different invalidation) is sub-optimal
- -       * but functionally ok.
- -       */
- -      WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
- -      if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
- -              return;
- -
- -      kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
- -}
- -
   void kvm_mmu_invalidate_begin(struct kvm *kvm)
   {
         lockdep_assert_held_write(&kvm->mmu_lock);
@@@ -789,7 -832,8 +789,7 @@@ static int kvm_mmu_notifier_invalidate_
          * mn_active_invalidate_count (see above) instead of
          * mmu_invalidate_in_progress.
          */
- -      gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
- -                                        hva_range.may_block);
+ +      gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
   
         /*
          * If one or more memslots were found and thus zapped, notify arch code
@@@ -866,7 -910,8 +866,7 @@@ static int kvm_mmu_notifier_clear_flush
   {
         trace_kvm_age_hva(start, end);
   
- -      return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
- -                                  kvm_age_gfn);
+ +      return kvm_handle_hva_range(mn, start, end, kvm_age_gfn);
   }
   
   static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@@ -919,6 -964,7 +919,6 @@@ static const struct mmu_notifier_ops kv
         .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
         .clear_young            = kvm_mmu_notifier_clear_young,
         .test_young             = kvm_mmu_notifier_test_young,
- -      .change_pte             = kvm_mmu_notifier_change_pte,
         .release                = kvm_mmu_notifier_release,
   };
   
@@@ -974,7 -1020,7 +974,7 @@@ static void kvm_destroy_dirty_bitmap(st
         if (!memslot->dirty_bitmap)
                 return;
   
-       kvfree(memslot->dirty_bitmap);
+       vfree(memslot->dirty_bitmap);
         memslot->dirty_bitmap = NULL;
   }
   
@@@ -1283,12 -1329,6 +1283,12 @@@ static void kvm_destroy_devices(struct 
          * We do not need to take the kvm->lock here, because nobody else
          * has a reference to the struct kvm at this point and therefore
          * cannot access the devices list anyhow.
+ +       *
+ +       * The device list is generally managed as an rculist, but list_del()
+ +       * is used intentionally here. If a bug in KVM introduced a reader that
+ +       * was not backed by a reference on the kvm struct, the hope is that
+ +       * it'd consume the poisoned forward pointer instead of suffering a
+ +       * use-after-free, even though this cannot be guaranteed.
          */
         list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
                 list_del(&dev->vm_node);
@@@ -2923,7 -2963,7 +2923,7 @@@ out
   /*
    * Pin guest page in memory and return its pfn.
    * @addr: host virtual address which maps memory to the guest
- - * @atomic: whether this function can sleep
+ + * @atomic: whether this function is forbidden from sleeping
    * @interruptible: whether the process can be interrupted by non-fatal signals
    * @async: whether this function need to wait IO complete if the
    *         host page is not in the memory
@@@ -2995,12 -3035,16 +2995,12 @@@ kvm_pfn_t __gfn_to_pfn_memslot(const st
         if (hva)
                 *hva = addr;
   
- -      if (addr == KVM_HVA_ERR_RO_BAD) {
- -              if (writable)
- -                      *writable = false;
- -              return KVM_PFN_ERR_RO_FAULT;
- -      }
- -
         if (kvm_is_error_hva(addr)) {
                 if (writable)
                         *writable = false;
- -              return KVM_PFN_NOSLOT;
+ +
+ +              return addr == KVM_HVA_ERR_RO_BAD ? KVM_PFN_ERR_RO_FAULT :
+ +                                                  KVM_PFN_NOSLOT;
         }
   
         /* Do not map writable pfn in the readonly memslot. */
@@@ -3264,7 -3308,6 +3264,7 @@@ static int next_segment(unsigned long l
                 return len;
   }
   
+ +/* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
   static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
                                  void *data, int offset, int len)
   {
@@@ -3366,7 -3409,6 +3366,7 @@@ int kvm_vcpu_read_guest_atomic(struct k
   }
   EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
   
+ +/* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
   static int __kvm_write_guest_page(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot, gfn_t gfn,
                                   const void *data, int offset, int len)
@@@ -4683,8 -4725,7 +4683,8 @@@ static int kvm_device_release(struct in
   
         if (dev->ops->release) {
                 mutex_lock(&kvm->lock);
- -              list_del(&dev->vm_node);
+ +              list_del_rcu(&dev->vm_node);
+ +              synchronize_rcu();
                 dev->ops->release(dev);
                 mutex_unlock(&kvm->lock);
         }
@@@ -4767,7 -4808,7 +4767,7 @@@ static int kvm_ioctl_create_device(stru
                 kfree(dev);
                 return ret;
         }
- -      list_add(&dev->vm_node, &kvm->devices);
+ +      list_add_rcu(&dev->vm_node, &kvm->devices);
         mutex_unlock(&kvm->lock);
   
         if (ops->init)
@@@ -4778,8 -4819,7 +4778,8 @@@
         if (ret < 0) {
                 kvm_put_kvm_no_destroy(kvm);
                 mutex_lock(&kvm->lock);
- -              list_del(&dev->vm_node);
+ +              list_del_rcu(&dev->vm_node);
+ +              synchronize_rcu();
                 if (ops->release)
                         ops->release(dev);
                 mutex_unlock(&kvm->lock);
author	Paolo Bonzini <[email protected]>
	Sun, 12 May 2024 07:18:44 +0000 (03:18 -0400)
committer	Paolo Bonzini <[email protected]>
	Sun, 12 May 2024 07:18:44 +0000 (03:18 -0400)
		1	2
arch/x86/kvm/cpuid.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/mmu.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/mmu/mmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history
virt/kvm/kvm_main.c	patch \|	diff1 \|	diff2 \|	blob \| history