From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sun, 12 May 2024 07:18:44 +0000 (-0400)
Subject: Merge tag 'kvm-x86-misc-6.10' of https://github.com/kvm-x86/linux into HEAD
X-Git-Tag: v6.10-rc1~139^2~1
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/7d41e24da29a83acc52a78a68aa515dd76e41cc1?hp=-c

Merge tag 'kvm-x86-misc-6.10' of https://github.com/kvm-x86/linux into HEAD

KVM x86 misc changes for 6.10:

 - Advertise the max mappable GPA in the "guest MAXPHYADDR" CPUID field, which
   is unused by hardware, so that KVM can communicate its inability to map GPAs
   that set bits 51:48 due to lack of 5-level paging.  Guest firmware is
   expected to use the information to safely remap BARs in the uppermost GPA
   space, i.e. to avoid placing a BAR at a legal, but unmappable, GPA.

 - Use vfree() instead of kvfree() for allocations that always use vcalloc()
   or __vcalloc().

 - Don't completely ignore same-value writes to immutable feature MSRs, as
   doing so results in KVM failing to reject accesses to MSRs that aren't
   supposed to exist given the vCPU model and/or KVM configuration.

 - Don't mark APICv as being inhibited due to ABSENT if APICv is disabled
   KVM-wide to avoid confusing debuggers (KVM will never bother clearing the
   ABSENT inhibit, even if userspace enables in-kernel local APIC).
---

7d41e24da29a83acc52a78a68aa515dd76e41cc1
diff --combined arch/x86/kvm/cpuid.c
index 1851b3870a9c,1c5583addc90..f2f2be5d1141
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@@ -376,7 -376,6 +376,7 @@@ static void kvm_vcpu_after_set_cpuid(st
  
  	kvm_update_pv_runtime(vcpu);
  
 +	vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu);
  	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
  	vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
  
@@@ -772,7 -771,7 +772,7 @@@ void kvm_set_cpu_caps(void
  	kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
  
  	kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
 -		0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
 +		0 /* SME */ | 0 /* SEV */ | 0 /* VM_PAGE_FLUSH */ | 0 /* SEV_ES */ |
  		F(SME_COHERENT));
  
  	kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
@@@ -1232,9 -1231,22 +1232,22 @@@ static inline int __do_cpuid_func(struc
  		entry->eax = entry->ebx = entry->ecx = 0;
  		break;
  	case 0x80000008: {
- 		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
- 		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
- 		unsigned phys_as = entry->eax & 0xff;
+ 		/*
+ 		 * GuestPhysAddrSize (EAX[23:16]) is intended for software
+ 		 * use.
+ 		 *
+ 		 * KVM's ABI is to report the effective MAXPHYADDR for the
+ 		 * guest in PhysAddrSize (phys_as), and the maximum
+ 		 * *addressable* GPA in GuestPhysAddrSize (g_phys_as).
+ 		 *
+ 		 * GuestPhysAddrSize is valid if and only if TDP is enabled,
+ 		 * in which case the max GPA that can be addressed by KVM may
+ 		 * be less than the max GPA that can be legally generated by
+ 		 * the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't
+ 		 * support 5-level TDP.
+ 		 */
+ 		unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U);
+ 		unsigned int phys_as, g_phys_as;
  
  		/*
  		 * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
@@@ -1242,16 -1254,24 +1255,24 @@@
  		 * reductions in MAXPHYADDR for memory encryption affect shadow
  		 * paging, too.
  		 *
- 		 * If TDP is enabled but an explicit guest MAXPHYADDR is not
- 		 * provided, use the raw bare metal MAXPHYADDR as reductions to
- 		 * the HPAs do not affect GPAs.
+ 		 * If TDP is enabled, use the raw bare metal MAXPHYADDR as
+ 		 * reductions to the HPAs do not affect GPAs.  The max
+ 		 * addressable GPA is the same as the max effective GPA, except
+ 		 * that it's capped at 48 bits if 5-level TDP isn't supported
+ 		 * (hardware processes bits 51:48 only when walking the fifth
+ 		 * level page table).
  		 */
- 		if (!tdp_enabled)
- 			g_phys_as = boot_cpu_data.x86_phys_bits;
- 		else if (!g_phys_as)
+ 		if (!tdp_enabled) {
+ 			phys_as = boot_cpu_data.x86_phys_bits;
+ 			g_phys_as = 0;
+ 		} else {
+ 			phys_as = entry->eax & 0xff;
  			g_phys_as = phys_as;
+ 			if (kvm_mmu_get_max_tdp_level() < 5)
+ 				g_phys_as = min(g_phys_as, 48);
+ 		}
  
- 		entry->eax = g_phys_as | (virt_as << 8);
+ 		entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16);
  		entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
  		entry->edx = 0;
  		cpuid_entry_override(entry, CPUID_8000_0008_EBX);
diff --combined arch/x86/kvm/mmu.h
index 2343c9f00e31,b410a227c601..2e454316f2a2
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@@ -100,6 -100,8 +100,8 @@@ static inline u8 kvm_get_shadow_phys_bi
  	return boot_cpu_data.x86_phys_bits;
  }
  
+ u8 kvm_mmu_get_max_tdp_level(void);
+ 
  void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
  void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
  void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
@@@ -213,7 -215,7 +215,7 @@@ static inline u8 permission_fault(struc
  	 */
  	u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
  	bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
 -	int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
 +	int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
  	u32 errcode = PFERR_PRESENT_MASK;
  	bool fault;
  
@@@ -234,7 -236,8 +236,7 @@@
  		pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
  
  		/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
 -		offset = (pfec & ~1) +
 -			((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
 +		offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);
  
  		pkru_bits &= mmu->pkru_mask >> offset;
  		errcode |= -pkru_bits & PFERR_PK_MASK;
diff --combined arch/x86/kvm/mmu/mmu.c
index 99f7b2f3d82a,db3a26eb7b75..662f62dfb2aa
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@@ -432,8 -432,8 +432,8 @@@ static u64 __update_clear_spte_slow(u6
   * The idea using the light way get the spte on x86_32 guest is from
   * gup_get_pte (mm/gup.c).
   *
 - * An spte tlb flush may be pending, because kvm_set_pte_rmap
 - * coalesces them and we are running out of the MMU lock.  Therefore
 + * An spte tlb flush may be pending, because they are coalesced and
 + * we are running out of the MMU lock.  Therefore
   * we need to protect against in-progress updates of the spte.
   *
   * Reading the spte while an update is in progress may get the old value
@@@ -567,9 -567,9 +567,9 @@@ static u64 mmu_spte_clear_track_bits(st
  
  	if (!is_shadow_present_pte(old_spte) ||
  	    !spte_has_volatile_bits(old_spte))
 -		__update_clear_spte_fast(sptep, 0ull);
 +		__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
  	else
 -		old_spte = __update_clear_spte_slow(sptep, 0ull);
 +		old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
  
  	if (!is_shadow_present_pte(old_spte))
  		return old_spte;
@@@ -603,7 -603,7 +603,7 @@@
   */
  static void mmu_spte_clear_no_track(u64 *sptep)
  {
 -	__update_clear_spte_fast(sptep, 0ull);
 +	__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
  }
  
  static u64 mmu_spte_get_lockless(u64 *sptep)
@@@ -831,15 -831,6 +831,15 @@@ static void account_shadowed(struct kv
  	gfn_t gfn;
  
  	kvm->arch.indirect_shadow_pages++;
 +	/*
 +	 * Ensure indirect_shadow_pages is elevated prior to re-reading guest
 +	 * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight
 +	 * emulated writes are visible before re-reading guest PTEs, or that
 +	 * an emulated write will see the elevated count and acquire mmu_lock
 +	 * to update SPTEs.  Pairs with the smp_mb() in kvm_mmu_track_write().
 +	 */
 +	smp_mb();
 +
  	gfn = sp->gfn;
  	slots = kvm_memslots_for_spte_role(kvm, sp->role);
  	slot = __gfn_to_memslot(slots, gfn);
@@@ -1457,11 -1448,49 +1457,11 @@@ static bool __kvm_zap_rmap(struct kvm *
  }
  
  static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 -			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
 -			 pte_t unused)
 +			 struct kvm_memory_slot *slot, gfn_t gfn, int level)
  {
  	return __kvm_zap_rmap(kvm, rmap_head, slot);
  }
  
 -static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 -			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
 -			     pte_t pte)
 -{
 -	u64 *sptep;
 -	struct rmap_iterator iter;
 -	bool need_flush = false;
 -	u64 new_spte;
 -	kvm_pfn_t new_pfn;
 -
 -	WARN_ON_ONCE(pte_huge(pte));
 -	new_pfn = pte_pfn(pte);
 -
 -restart:
 -	for_each_rmap_spte(rmap_head, &iter, sptep) {
 -		need_flush = true;
 -
 -		if (pte_write(pte)) {
 -			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
 -			goto restart;
 -		} else {
 -			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
 -					*sptep, new_pfn);
 -
 -			mmu_spte_clear_track_bits(kvm, sptep);
 -			mmu_spte_set(sptep, new_spte);
 -		}
 -	}
 -
 -	if (need_flush && kvm_available_flush_remote_tlbs_range()) {
 -		kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
 -		return false;
 -	}
 -
 -	return need_flush;
 -}
 -
  struct slot_rmap_walk_iterator {
  	/* input fields. */
  	const struct kvm_memory_slot *slot;
@@@ -1533,7 -1562,7 +1533,7 @@@ static void slot_rmap_walk_next(struct 
  
  typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
  			       struct kvm_memory_slot *slot, gfn_t gfn,
 -			       int level, pte_t pte);
 +			       int level);
  
  static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
  						 struct kvm_gfn_range *range,
@@@ -1545,7 -1574,7 +1545,7 @@@
  	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
  				 range->start, range->end - 1, &iterator)
  		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
 -			       iterator.level, range->arg.pte);
 +			       iterator.level);
  
  	return ret;
  }
@@@ -1567,8 -1596,22 +1567,8 @@@ bool kvm_unmap_gfn_range(struct kvm *kv
  	return flush;
  }
  
 -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 -{
 -	bool flush = false;
 -
 -	if (kvm_memslots_have_rmaps(kvm))
 -		flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
 -
 -	if (tdp_mmu_enabled)
 -		flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
 -
 -	return flush;
 -}
 -
  static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 -			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
 -			 pte_t unused)
 +			 struct kvm_memory_slot *slot, gfn_t gfn, int level)
  {
  	u64 *sptep;
  	struct rmap_iterator iter;
@@@ -1581,7 -1624,8 +1581,7 @@@
  }
  
  static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 -			      struct kvm_memory_slot *slot, gfn_t gfn,
 -			      int level, pte_t unused)
 +			      struct kvm_memory_slot *slot, gfn_t gfn, int level)
  {
  	u64 *sptep;
  	struct rmap_iterator iter;
@@@ -1906,8 -1950,7 +1906,8 @@@ static bool kvm_sync_page_check(struct 
  
  static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
  {
 -	if (!sp->spt[i])
 +	/* sp->spt[i] has initial value of shadow page table allocation */
 +	if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
  		return 0;
  
  	return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
@@@ -2471,7 -2514,7 +2471,7 @@@ static int mmu_page_zap_pte(struct kvm 
  				return kvm_mmu_prepare_zap_page(kvm, child,
  								invalid_list);
  		}
 -	} else if (is_mmio_spte(pte)) {
 +	} else if (is_mmio_spte(kvm, pte)) {
  		mmu_spte_clear_no_track(spte);
  	}
  	return 0;
@@@ -3271,19 -3314,9 +3271,19 @@@ static int kvm_handle_noslot_fault(stru
  {
  	gva_t gva = fault->is_tdp ? 0 : fault->addr;
  
 +	if (fault->is_private) {
 +		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
 +		return -EFAULT;
 +	}
 +
  	vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
  			     access & shadow_mmio_access_mask);
  
 +	fault->slot = NULL;
 +	fault->pfn = KVM_PFN_NOSLOT;
 +	fault->map_writable = false;
 +	fault->hva = KVM_HVA_ERR_BAD;
 +
  	/*
  	 * If MMIO caching is disabled, emulate immediately without
  	 * touching the shadow page tables as attempting to install an
@@@ -4163,7 -4196,7 +4163,7 @@@ static int handle_mmio_page_fault(struc
  	if (WARN_ON_ONCE(reserved))
  		return -EINVAL;
  
 -	if (is_mmio_spte(spte)) {
 +	if (is_mmio_spte(vcpu->kvm, spte)) {
  		gfn_t gfn = get_mmio_spte_gfn(spte);
  		unsigned int access = get_mmio_spte_access(spte);
  
@@@ -4226,28 -4259,24 +4226,28 @@@ static u32 alloc_apf_token(struct kvm_v
  	return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
  }
  
 -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 -				    gfn_t gfn)
 +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
 +				    struct kvm_page_fault *fault)
  {
  	struct kvm_arch_async_pf arch;
  
  	arch.token = alloc_apf_token(vcpu);
 -	arch.gfn = gfn;
 +	arch.gfn = fault->gfn;
 +	arch.error_code = fault->error_code;
  	arch.direct_map = vcpu->arch.mmu->root_role.direct;
  	arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
  
 -	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
 -				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 +	return kvm_setup_async_pf(vcpu, fault->addr,
 +				  kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
  }
  
  void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
  {
  	int r;
  
 +	if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
 +		return;
 +
  	if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
  	      work->wakeup_all)
  		return;
@@@ -4260,7 -4289,7 +4260,7 @@@
  	      work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
  		return;
  
 -	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
 +	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
  }
  
  static inline u8 kvm_max_level_for_order(int order)
@@@ -4280,6 -4309,14 +4280,6 @@@
  	return PG_LEVEL_4K;
  }
  
 -static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
 -					      struct kvm_page_fault *fault)
 -{
 -	kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
 -				      PAGE_SIZE, fault->write, fault->exec,
 -				      fault->is_private);
 -}
 -
  static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
  				   struct kvm_page_fault *fault)
  {
@@@ -4306,15 -4343,48 +4306,15 @@@
  
  static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
  {
 -	struct kvm_memory_slot *slot = fault->slot;
  	bool async;
  
 -	/*
 -	 * Retry the page fault if the gfn hit a memslot that is being deleted
 -	 * or moved.  This ensures any existing SPTEs for the old memslot will
 -	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
 -	 */
 -	if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
 -		return RET_PF_RETRY;
 -
 -	if (!kvm_is_visible_memslot(slot)) {
 -		/* Don't expose private memslots to L2. */
 -		if (is_guest_mode(vcpu)) {
 -			fault->slot = NULL;
 -			fault->pfn = KVM_PFN_NOSLOT;
 -			fault->map_writable = false;
 -			return RET_PF_CONTINUE;
 -		}
 -		/*
 -		 * If the APIC access page exists but is disabled, go directly
 -		 * to emulation without caching the MMIO access or creating a
 -		 * MMIO SPTE.  That way the cache doesn't need to be purged
 -		 * when the AVIC is re-enabled.
 -		 */
 -		if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
 -		    !kvm_apicv_activated(vcpu->kvm))
 -			return RET_PF_EMULATE;
 -	}
 -
 -	if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
 -		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
 -		return -EFAULT;
 -	}
 -
  	if (fault->is_private)
  		return kvm_faultin_pfn_private(vcpu, fault);
  
  	async = false;
 -	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
 -					  fault->write, &fault->map_writable,
 -					  &fault->hva);
 +	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false,
 +					  &async, fault->write,
 +					  &fault->map_writable, &fault->hva);
  	if (!async)
  		return RET_PF_CONTINUE; /* *pfn has correct page already */
  
@@@ -4324,7 -4394,7 +4324,7 @@@
  			trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
  			kvm_make_request(KVM_REQ_APF_HALT, vcpu);
  			return RET_PF_RETRY;
 -		} else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
 +		} else if (kvm_arch_setup_async_pf(vcpu, fault)) {
  			return RET_PF_RETRY;
  		}
  	}
@@@ -4334,72 -4404,17 +4334,72 @@@
  	 * to wait for IO.  Note, gup always bails if it is unable to quickly
  	 * get a page and a fatal signal, i.e. SIGKILL, is pending.
  	 */
 -	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
 -					  fault->write, &fault->map_writable,
 -					  &fault->hva);
 +	fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
 +					  NULL, fault->write,
 +					  &fault->map_writable, &fault->hva);
  	return RET_PF_CONTINUE;
  }
  
  static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
  			   unsigned int access)
  {
 +	struct kvm_memory_slot *slot = fault->slot;
  	int ret;
  
 +	/*
 +	 * Note that the mmu_invalidate_seq also serves to detect a concurrent
 +	 * change in attributes.  is_page_fault_stale() will detect an
 +	 * invalidation related to fault->gfn and resume the guest without
 +	 * installing a mapping in the page tables.
 +	 */
 +	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 +	smp_rmb();
 +
 +	/*
 +	 * Now that we have a snapshot of mmu_invalidate_seq we can check for a
 +	 * private vs. shared mismatch.
 +	 */
 +	if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
 +		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
 +		return -EFAULT;
 +	}
 +
 +	if (unlikely(!slot))
 +		return kvm_handle_noslot_fault(vcpu, fault, access);
 +
 +	/*
 +	 * Retry the page fault if the gfn hit a memslot that is being deleted
 +	 * or moved.  This ensures any existing SPTEs for the old memslot will
 +	 * be zapped before KVM inserts a new MMIO SPTE for the gfn.
 +	 */
 +	if (slot->flags & KVM_MEMSLOT_INVALID)
 +		return RET_PF_RETRY;
 +
 +	if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
 +		/*
 +		 * Don't map L1's APIC access page into L2, KVM doesn't support
 +		 * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
 +		 * i.e. the access needs to be emulated.  Emulating access to
 +		 * L1's APIC is also correct if L1 is accelerating L2's own
 +		 * virtual APIC, but for some reason L1 also maps _L1's_ APIC
 +		 * into L2.  Note, vcpu_is_mmio_gpa() always treats access to
 +		 * the APIC as MMIO.  Allow an MMIO SPTE to be created, as KVM
 +		 * uses different roots for L1 vs. L2, i.e. there is no danger
 +		 * of breaking APICv/AVIC for L1.
 +		 */
 +		if (is_guest_mode(vcpu))
 +			return kvm_handle_noslot_fault(vcpu, fault, access);
 +
 +		/*
 +		 * If the APIC access page exists but is disabled, go directly
 +		 * to emulation without caching the MMIO access or creating a
 +		 * MMIO SPTE.  That way the cache doesn't need to be purged
 +		 * when the AVIC is re-enabled.
 +		 */
 +		if (!kvm_apicv_activated(vcpu->kvm))
 +			return RET_PF_EMULATE;
 +	}
 +
  	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
  	smp_rmb();
  
@@@ -4424,7 -4439,8 +4424,7 @@@
  	 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
  	 * to detect retry guarantees the worst case latency for the vCPU.
  	 */
 -	if (fault->slot &&
 -	    mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
 +	if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
  		return RET_PF_RETRY;
  
  	ret = __kvm_faultin_pfn(vcpu, fault);
@@@ -4434,7 -4450,7 +4434,7 @@@
  	if (unlikely(is_error_pfn(fault->pfn)))
  		return kvm_handle_error_pfn(vcpu, fault);
  
 -	if (unlikely(!fault->slot))
 +	if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
  		return kvm_handle_noslot_fault(vcpu, fault, access);
  
  	/*
@@@ -4545,16 -4561,6 +4545,16 @@@ int kvm_handle_page_fault(struct kvm_vc
  	if (WARN_ON_ONCE(fault_address >> 32))
  		return -EFAULT;
  #endif
 +	/*
 +	 * Legacy #PF exceptions only have a 32-bit error code.  Simply drop the
 +	 * upper bits as KVM doesn't use them for #PF (because they are never
 +	 * set), and to ensure there are no collisions with KVM-defined bits.
 +	 */
 +	if (WARN_ON_ONCE(error_code >> 32))
 +		error_code = lower_32_bits(error_code);
 +
 +	/* Ensure the above sanity check also covers KVM-defined flags. */
 +	BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
  
  	vcpu->arch.l1tf_flush_l1d = true;
  	if (!flags) {
@@@ -4806,7 -4812,7 +4806,7 @@@ EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd)
  static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
  			   unsigned int access)
  {
 -	if (unlikely(is_mmio_spte(*sptep))) {
 +	if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
  		if (gfn != get_mmio_spte_gfn(*sptep)) {
  			mmu_spte_clear_no_track(sptep);
  			return true;
@@@ -4929,7 -4935,7 +4929,7 @@@ static void reset_guest_rsvds_bits_mask
  				context->cpu_role.base.level, is_efer_nx(context),
  				guest_can_use(vcpu, X86_FEATURE_GBPAGES),
  				is_cr4_pse(context),
 -				guest_cpuid_is_amd_or_hygon(vcpu));
 +				guest_cpuid_is_amd_compatible(vcpu));
  }
  
  static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
@@@ -5316,6 -5322,11 +5316,11 @@@ static inline int kvm_mmu_get_tdp_level
  	return max_tdp_level;
  }
  
+ u8 kvm_mmu_get_max_tdp_level(void)
+ {
+ 	return tdp_root_level ? tdp_root_level : max_tdp_level;
+ }
+ 
  static union kvm_mmu_page_role
  kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
  				union kvm_cpu_role cpu_role)
@@@ -5570,9 -5581,9 +5575,9 @@@ void kvm_mmu_after_set_cpuid(struct kvm
  	 * that problem is swept under the rug; KVM's CPUID API is horrific and
  	 * it's all but impossible to solve it without introducing a new API.
  	 */
 -	vcpu->arch.root_mmu.root_role.word = 0;
 -	vcpu->arch.guest_mmu.root_role.word = 0;
 -	vcpu->arch.nested_mmu.root_role.word = 0;
 +	vcpu->arch.root_mmu.root_role.invalid = 1;
 +	vcpu->arch.guest_mmu.root_role.invalid = 1;
 +	vcpu->arch.nested_mmu.root_role.invalid = 1;
  	vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
  	vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
  	vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
@@@ -5796,15 -5807,10 +5801,15 @@@ void kvm_mmu_track_write(struct kvm_vcp
  	bool flush = false;
  
  	/*
 -	 * If we don't have indirect shadow pages, it means no page is
 -	 * write-protected, so we can exit simply.
 +	 * When emulating guest writes, ensure the written value is visible to
 +	 * any task that is handling page faults before checking whether or not
 +	 * KVM is shadowing a guest PTE.  This ensures either KVM will create
 +	 * the correct SPTE in the page fault handler, or this task will see
 +	 * a non-zero indirect_shadow_pages.  Pairs with the smp_mb() in
 +	 * account_shadowed().
  	 */
 -	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
 +	smp_mb();
 +	if (!vcpu->kvm->arch.indirect_shadow_pages)
  		return;
  
  	write_lock(&vcpu->kvm->mmu_lock);
@@@ -5845,35 -5851,30 +5850,35 @@@ int noinline kvm_mmu_page_fault(struct 
  	int r, emulation_type = EMULTYPE_PF;
  	bool direct = vcpu->arch.mmu->root_role.direct;
  
 -	/*
 -	 * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
 -	 * checks when emulating instructions that triggers implicit access.
 -	 * WARN if hardware generates a fault with an error code that collides
 -	 * with the KVM-defined value.  Clear the flag and continue on, i.e.
 -	 * don't terminate the VM, as KVM can't possibly be relying on a flag
 -	 * that KVM doesn't know about.
 -	 */
 -	if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
 -		error_code &= ~PFERR_IMPLICIT_ACCESS;
 -
  	if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
  		return RET_PF_RETRY;
  
 +	/*
 +	 * Except for reserved faults (emulated MMIO is shared-only), set the
 +	 * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
 +	 * current attributes, which are the source of truth for such VMs.  Note,
 +	 * this is wrong for nested MMUs as the GPA is an L2 GPA, but KVM
 +	 * doesn't currently support nested virtualization (among many other
 +	 * things) for software-protected VMs.
 +	 */
 +	if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
 +	    !(error_code & PFERR_RSVD_MASK) &&
 +	    vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
 +	    kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
 +		error_code |= PFERR_PRIVATE_ACCESS;
 +
  	r = RET_PF_INVALID;
  	if (unlikely(error_code & PFERR_RSVD_MASK)) {
 +		if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
 +			return -EFAULT;
 +
  		r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
  		if (r == RET_PF_EMULATE)
  			goto emulate;
  	}
  
  	if (r == RET_PF_INVALID) {
 -		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
 -					  lower_32_bits(error_code), false,
 +		r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
  					  &emulation_type);
  		if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
  			return -EIO;
@@@ -6177,10 -6178,7 +6182,10 @@@ int kvm_mmu_create(struct kvm_vcpu *vcp
  	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
  	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
  
 -	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
 +	vcpu->arch.mmu_shadow_page_cache.init_value =
 +		SHADOW_NONPRESENT_VALUE;
 +	if (!vcpu->arch.mmu_shadow_page_cache.init_value)
 +		vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
  
  	vcpu->arch.mmu = &vcpu->arch.root_mmu;
  	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@@ -6323,7 -6321,6 +6328,7 @@@ static bool kvm_has_zapped_obsolete_pag
  
  void kvm_mmu_init_vm(struct kvm *kvm)
  {
 +	kvm->arch.shadow_mmio_value = shadow_mmio_value;
  	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
  	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
  	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
@@@ -7407,8 -7404,7 +7412,8 @@@ bool kvm_arch_post_set_memory_attribute
  			 * by the memslot, KVM can't use a hugepage due to the
  			 * misaligned address regardless of memory attributes.
  			 */
 -			if (gfn >= slot->base_gfn) {
 +			if (gfn >= slot->base_gfn &&
 +			    gfn + nr_pages <= slot->base_gfn + slot->npages) {
  				if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
  					hugepage_clear_mixed(slot, gfn, level);
  				else
diff --combined arch/x86/kvm/x86.c
index fda22b3800a1,95a86ee871ff..082ac6d95a3a
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -92,12 -92,9 +92,12 @@@
  #define MAX_IO_MSRS 256
  #define KVM_MAX_MCE_BANKS 32
  
 -struct kvm_caps kvm_caps __read_mostly = {
 -	.supported_mce_cap = MCG_CTL_P | MCG_SER_P,
 -};
 +/*
 + * Note, kvm_caps fields should *never* have default values, all fields must be
 + * recomputed from scratch during vendor module load, e.g. to account for a
 + * vendor module being reloaded with different module parameters.
 + */
 +struct kvm_caps kvm_caps __read_mostly;
  EXPORT_SYMBOL_GPL(kvm_caps);
  
  #define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
@@@ -1624,7 -1621,7 +1624,7 @@@ static bool kvm_is_immutable_feature_ms
  	 ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
  	 ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
  	 ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
 -	 ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR)
 +	 ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO)
  
  static u64 kvm_get_arch_capabilities(void)
  {
@@@ -2233,16 -2230,13 +2233,13 @@@ static int do_set_msr(struct kvm_vcpu *
  	/*
  	 * Disallow writes to immutable feature MSRs after KVM_RUN.  KVM does
  	 * not support modifying the guest vCPU model on the fly, e.g. changing
- 	 * the nVMX capabilities while L2 is running is nonsensical.  Ignore
+ 	 * the nVMX capabilities while L2 is running is nonsensical.  Allow
  	 * writes of the same value, e.g. to allow userspace to blindly stuff
  	 * all MSRs when emulating RESET.
  	 */
- 	if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
- 		if (do_get_msr(vcpu, index, &val) || *data != val)
- 			return -EINVAL;
- 
- 		return 0;
- 	}
+ 	if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) &&
+ 	    (do_get_msr(vcpu, index, &val) || *data != val))
+ 		return -EINVAL;
  
  	return kvm_set_msr_ignored_check(vcpu, index, *data, true);
  }
@@@ -3473,7 -3467,7 +3470,7 @@@ static bool is_mci_status_msr(u32 msr
  static bool can_set_mci_status(struct kvm_vcpu *vcpu)
  {
  	/* McStatusWrEn enabled? */
 -	if (guest_cpuid_is_amd_or_hygon(vcpu))
 +	if (guest_cpuid_is_amd_compatible(vcpu))
  		return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
  
  	return false;
@@@ -4632,7 -4626,9 +4629,7 @@@ static int kvm_ioctl_get_supported_hv_c
  
  static bool kvm_is_vm_type_supported(unsigned long type)
  {
 -	return type == KVM_X86_DEFAULT_VM ||
 -	       (type == KVM_X86_SW_PROTECTED_VM &&
 -		IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
 +	return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
  }
  
  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@@ -4833,7 -4829,9 +4830,7 @@@
  		r = kvm_caps.has_notify_vmexit;
  		break;
  	case KVM_CAP_VM_TYPES:
 -		r = BIT(KVM_X86_DEFAULT_VM);
 -		if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM))
 -			r |= BIT(KVM_X86_SW_PROTECTED_VM);
 +		r = kvm_caps.supported_vm_types;
  		break;
  	default:
  		break;
@@@ -4841,44 -4839,46 +4838,44 @@@
  	return r;
  }
  
 -static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
 -{
 -	void __user *uaddr = (void __user*)(unsigned long)attr->addr;
 -
 -	if ((u64)(unsigned long)uaddr != attr->addr)
 -		return ERR_PTR_USR(-EFAULT);
 -	return uaddr;
 -}
 -
 -static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
 +static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val)
  {
 -	u64 __user *uaddr = kvm_get_attr_addr(attr);
 -
 -	if (attr->group)
 +	if (attr->group) {
 +		if (kvm_x86_ops.dev_get_attr)
 +			return static_call(kvm_x86_dev_get_attr)(attr->group, attr->attr, val);
  		return -ENXIO;
 -
 -	if (IS_ERR(uaddr))
 -		return PTR_ERR(uaddr);
 +	}
  
  	switch (attr->attr) {
  	case KVM_X86_XCOMP_GUEST_SUPP:
 -		if (put_user(kvm_caps.supported_xcr0, uaddr))
 -			return -EFAULT;
 +		*val = kvm_caps.supported_xcr0;
  		return 0;
  	default:
  		return -ENXIO;
  	}
  }
  
 +static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
 +{
 +	u64 __user *uaddr = u64_to_user_ptr(attr->addr);
 +	int r;
 +	u64 val;
 +
 +	r = __kvm_x86_dev_get_attr(attr, &val);
 +	if (r < 0)
 +		return r;
 +
 +	if (put_user(val, uaddr))
 +		return -EFAULT;
 +
 +	return 0;
 +}
 +
  static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
  {
 -	if (attr->group)
 -		return -ENXIO;
 +	u64 val;
  
 -	switch (attr->attr) {
 -	case KVM_X86_XCOMP_GUEST_SUPP:
 -		return 0;
 -	default:
 -		return -ENXIO;
 -	}
 +	return __kvm_x86_dev_get_attr(attr, &val);
  }
  
  long kvm_arch_dev_ioctl(struct file *filp,
@@@ -5554,15 -5554,11 +5551,15 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
  	return 0;
  }
  
 -static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 -					     struct kvm_debugregs *dbgregs)
 +static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 +					    struct kvm_debugregs *dbgregs)
  {
  	unsigned int i;
  
 +	if (vcpu->kvm->arch.has_protected_state &&
 +	    vcpu->arch.guest_state_protected)
 +		return -EINVAL;
 +
  	memset(dbgregs, 0, sizeof(*dbgregs));
  
  	BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
@@@ -5571,7 -5567,6 +5568,7 @@@
  
  	dbgregs->dr6 = vcpu->arch.dr6;
  	dbgregs->dr7 = vcpu->arch.dr7;
 +	return 0;
  }
  
  static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@@ -5579,10 -5574,6 +5576,10 @@@
  {
  	unsigned int i;
  
 +	if (vcpu->kvm->arch.has_protected_state &&
 +	    vcpu->arch.guest_state_protected)
 +		return -EINVAL;
 +
  	if (dbgregs->flags)
  		return -EINVAL;
  
@@@ -5603,8 -5594,8 +5600,8 @@@
  }
  
  
 -static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
 -					  u8 *state, unsigned int size)
 +static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
 +					 u8 *state, unsigned int size)
  {
  	/*
  	 * Only copy state for features that are enabled for the guest.  The
@@@ -5622,25 -5613,24 +5619,25 @@@
  			     XFEATURE_MASK_FPSSE;
  
  	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
 -		return;
 +		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
  
  	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
  				       supported_xcr0, vcpu->arch.pkru);
 +	return 0;
  }
  
 -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 -					 struct kvm_xsave *guest_xsave)
 +static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 +					struct kvm_xsave *guest_xsave)
  {
 -	kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
 -				      sizeof(guest_xsave->region));
 +	return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
 +					     sizeof(guest_xsave->region));
  }
  
  static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
  					struct kvm_xsave *guest_xsave)
  {
  	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
 -		return 0;
 +		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
  
  	return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
  					      guest_xsave->region,
@@@ -5648,23 -5638,18 +5645,23 @@@
  					      &vcpu->arch.pkru);
  }
  
 -static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
 -					struct kvm_xcrs *guest_xcrs)
 +static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
 +				       struct kvm_xcrs *guest_xcrs)
  {
 +	if (vcpu->kvm->arch.has_protected_state &&
 +	    vcpu->arch.guest_state_protected)
 +		return -EINVAL;
 +
  	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
  		guest_xcrs->nr_xcrs = 0;
 -		return;
 +		return 0;
  	}
  
  	guest_xcrs->nr_xcrs = 1;
  	guest_xcrs->flags = 0;
  	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
  	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
 +	return 0;
  }
  
  static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
@@@ -5672,10 -5657,6 +5669,10 @@@
  {
  	int i, r = 0;
  
 +	if (vcpu->kvm->arch.has_protected_state &&
 +	    vcpu->arch.guest_state_protected)
 +		return -EINVAL;
 +
  	if (!boot_cpu_has(X86_FEATURE_XSAVE))
  		return -EINVAL;
  
@@@ -5728,9 -5709,12 +5725,9 @@@ static int kvm_arch_tsc_has_attr(struc
  static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
  				 struct kvm_device_attr *attr)
  {
 -	u64 __user *uaddr = kvm_get_attr_addr(attr);
 +	u64 __user *uaddr = u64_to_user_ptr(attr->addr);
  	int r;
  
 -	if (IS_ERR(uaddr))
 -		return PTR_ERR(uaddr);
 -
  	switch (attr->attr) {
  	case KVM_VCPU_TSC_OFFSET:
  		r = -EFAULT;
@@@ -5748,10 -5732,13 +5745,10 @@@
  static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
  				 struct kvm_device_attr *attr)
  {
 -	u64 __user *uaddr = kvm_get_attr_addr(attr);
 +	u64 __user *uaddr = u64_to_user_ptr(attr->addr);
  	struct kvm *kvm = vcpu->kvm;
  	int r;
  
 -	if (IS_ERR(uaddr))
 -		return PTR_ERR(uaddr);
 -
  	switch (attr->attr) {
  	case KVM_VCPU_TSC_OFFSET: {
  		u64 offset, tsc, ns;
@@@ -6058,9 -6045,7 +6055,9 @@@ long kvm_arch_vcpu_ioctl(struct file *f
  	case KVM_GET_DEBUGREGS: {
  		struct kvm_debugregs dbgregs;
  
 -		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
 +		r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
 +		if (r < 0)
 +			break;
  
  		r = -EFAULT;
  		if (copy_to_user(argp, &dbgregs,
@@@ -6090,9 -6075,7 +6087,9 @@@
  		if (!u.xsave)
  			break;
  
 -		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
 +		r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
 +		if (r < 0)
 +			break;
  
  		r = -EFAULT;
  		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
@@@ -6121,9 -6104,7 +6118,9 @@@
  		if (!u.xsave)
  			break;
  
 -		kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
 +		r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
 +		if (r < 0)
 +			break;
  
  		r = -EFAULT;
  		if (copy_to_user(argp, u.xsave, size))
@@@ -6139,9 -6120,7 +6136,9 @@@
  		if (!u.xcrs)
  			break;
  
 -		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
 +		r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
 +		if (r < 0)
 +			break;
  
  		r = -EFAULT;
  		if (copy_to_user(argp, u.xcrs,
@@@ -6285,11 -6264,6 +6282,11 @@@
  	}
  #endif
  	case KVM_GET_SREGS2: {
 +		r = -EINVAL;
 +		if (vcpu->kvm->arch.has_protected_state &&
 +		    vcpu->arch.guest_state_protected)
 +			goto out;
 +
  		u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
  		r = -ENOMEM;
  		if (!u.sregs2)
@@@ -6302,11 -6276,6 +6299,11 @@@
  		break;
  	}
  	case KVM_SET_SREGS2: {
 +		r = -EINVAL;
 +		if (vcpu->kvm->arch.has_protected_state &&
 +		    vcpu->arch.guest_state_protected)
 +			goto out;
 +
  		u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
  		if (IS_ERR(u.sregs2)) {
  			r = PTR_ERR(u.sregs2);
@@@ -9760,8 -9729,6 +9757,8 @@@ int kvm_x86_vendor_init(struct kvm_x86_
  		return -EIO;
  	}
  
 +	memset(&kvm_caps, 0, sizeof(kvm_caps));
 +
  	x86_emulator_cache = kvm_alloc_emulator_cache();
  	if (!x86_emulator_cache) {
  		pr_err("failed to allocate cache for x86 emulator\n");
@@@ -9780,9 -9747,6 +9777,9 @@@
  	if (r)
  		goto out_free_percpu;
  
 +	kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
 +	kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
 +
  	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
  		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
  		kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
@@@ -9828,9 -9792,6 +9825,9 @@@
  
  	kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
  
 +	if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled)
 +		kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM);
 +
  	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
  		kvm_caps.supported_xss = 0;
  
@@@ -10031,15 -9992,12 +10028,12 @@@ static void set_or_clear_apicv_inhibit(
  
  static void kvm_apicv_init(struct kvm *kvm)
  {
- 	unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons;
+ 	enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT :
+ 						       APICV_INHIBIT_REASON_DISABLE;
  
- 	init_rwsem(&kvm->arch.apicv_update_lock);
- 
- 	set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);
+ 	set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
  
- 	if (!enable_apicv)
- 		set_or_clear_apicv_inhibit(inhibits,
- 					   APICV_INHIBIT_REASON_DISABLE, true);
+ 	init_rwsem(&kvm->arch.apicv_update_lock);
  }
  
  static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
@@@ -10087,15 -10045,26 +10081,15 @@@ static int complete_hypercall_exit(stru
  	return kvm_skip_emulated_instruction(vcpu);
  }
  
 -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 +unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
 +				      unsigned long a0, unsigned long a1,
 +				      unsigned long a2, unsigned long a3,
 +				      int op_64_bit, int cpl)
  {
 -	unsigned long nr, a0, a1, a2, a3, ret;
 -	int op_64_bit;
 -
 -	if (kvm_xen_hypercall_enabled(vcpu->kvm))
 -		return kvm_xen_hypercall(vcpu);
 -
 -	if (kvm_hv_hypercall_enabled(vcpu))
 -		return kvm_hv_hypercall(vcpu);
 -
 -	nr = kvm_rax_read(vcpu);
 -	a0 = kvm_rbx_read(vcpu);
 -	a1 = kvm_rcx_read(vcpu);
 -	a2 = kvm_rdx_read(vcpu);
 -	a3 = kvm_rsi_read(vcpu);
 +	unsigned long ret;
  
  	trace_kvm_hypercall(nr, a0, a1, a2, a3);
  
 -	op_64_bit = is_64_bit_hypercall(vcpu);
  	if (!op_64_bit) {
  		nr &= 0xFFFFFFFF;
  		a0 &= 0xFFFFFFFF;
@@@ -10104,7 -10073,7 +10098,7 @@@
  		a3 &= 0xFFFFFFFF;
  	}
  
 -	if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
 +	if (cpl) {
  		ret = -KVM_EPERM;
  		goto out;
  	}
@@@ -10165,49 -10134,18 +10159,49 @@@
  
  		WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
  		vcpu->arch.complete_userspace_io = complete_hypercall_exit;
 +		/* stat is incremented on completion. */
  		return 0;
  	}
  	default:
  		ret = -KVM_ENOSYS;
  		break;
  	}
 +
  out:
 +	++vcpu->stat.hypercalls;
 +	return ret;
 +}
 +EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
 +
 +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 +{
 +	unsigned long nr, a0, a1, a2, a3, ret;
 +	int op_64_bit;
 +	int cpl;
 +
 +	if (kvm_xen_hypercall_enabled(vcpu->kvm))
 +		return kvm_xen_hypercall(vcpu);
 +
 +	if (kvm_hv_hypercall_enabled(vcpu))
 +		return kvm_hv_hypercall(vcpu);
 +
 +	nr = kvm_rax_read(vcpu);
 +	a0 = kvm_rbx_read(vcpu);
 +	a1 = kvm_rcx_read(vcpu);
 +	a2 = kvm_rdx_read(vcpu);
 +	a3 = kvm_rsi_read(vcpu);
 +	op_64_bit = is_64_bit_hypercall(vcpu);
 +	cpl = static_call(kvm_x86_get_cpl)(vcpu);
 +
 +	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
 +	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
 +		/* MAP_GPA tosses the request to the user space. */
 +		return 0;
 +
  	if (!op_64_bit)
  		ret = (u32)ret;
  	kvm_rax_write(vcpu, ret);
  
 -	++vcpu->stat.hypercalls;
  	return kvm_skip_emulated_instruction(vcpu);
  }
  EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@@ -11542,10 -11480,6 +11536,10 @@@ static void __get_regs(struct kvm_vcpu 
  
  int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
  {
 +	if (vcpu->kvm->arch.has_protected_state &&
 +	    vcpu->arch.guest_state_protected)
 +		return -EINVAL;
 +
  	vcpu_load(vcpu);
  	__get_regs(vcpu, regs);
  	vcpu_put(vcpu);
@@@ -11587,10 -11521,6 +11581,10 @@@ static void __set_regs(struct kvm_vcpu 
  
  int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
  {
 +	if (vcpu->kvm->arch.has_protected_state &&
 +	    vcpu->arch.guest_state_protected)
 +		return -EINVAL;
 +
  	vcpu_load(vcpu);
  	__set_regs(vcpu, regs);
  	vcpu_put(vcpu);
@@@ -11663,10 -11593,6 +11657,10 @@@ static void __get_sregs2(struct kvm_vcp
  int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
  				  struct kvm_sregs *sregs)
  {
 +	if (vcpu->kvm->arch.has_protected_state &&
 +	    vcpu->arch.guest_state_protected)
 +		return -EINVAL;
 +
  	vcpu_load(vcpu);
  	__get_sregs(vcpu, sregs);
  	vcpu_put(vcpu);
@@@ -11934,10 -11860,6 +11928,10 @@@ int kvm_arch_vcpu_ioctl_set_sregs(struc
  {
  	int ret;
  
 +	if (vcpu->kvm->arch.has_protected_state &&
 +	    vcpu->arch.guest_state_protected)
 +		return -EINVAL;
 +
  	vcpu_load(vcpu);
  	ret = __set_sregs(vcpu, sregs);
  	vcpu_put(vcpu);
@@@ -12055,7 -11977,7 +12049,7 @@@ int kvm_arch_vcpu_ioctl_get_fpu(struct 
  	struct fxregs_state *fxsave;
  
  	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
 -		return 0;
 +		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
  
  	vcpu_load(vcpu);
  
@@@ -12078,7 -12000,7 +12072,7 @@@ int kvm_arch_vcpu_ioctl_set_fpu(struct 
  	struct fxregs_state *fxsave;
  
  	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
 -		return 0;
 +		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
  
  	vcpu_load(vcpu);
  
@@@ -12604,8 -12526,6 +12598,8 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
  		return -EINVAL;
  
  	kvm->arch.vm_type = type;
 +	kvm->arch.has_private_mem =
 +		(type == KVM_X86_SW_PROTECTED_VM);
  
  	ret = kvm_page_track_init(kvm);
  	if (ret)
@@@ -12805,7 -12725,7 +12799,7 @@@ static void memslot_rmap_free(struct kv
  	int i;
  
  	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
- 		kvfree(slot->arch.rmap[i]);
+ 		vfree(slot->arch.rmap[i]);
  		slot->arch.rmap[i] = NULL;
  	}
  }
@@@ -12817,7 -12737,7 +12811,7 @@@ void kvm_arch_free_memslot(struct kvm *
  	memslot_rmap_free(slot);
  
  	for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
- 		kvfree(slot->arch.lpage_info[i - 1]);
+ 		vfree(slot->arch.lpage_info[i - 1]);
  		slot->arch.lpage_info[i - 1] = NULL;
  	}
  
@@@ -12909,7 -12829,7 +12903,7 @@@ out_free
  	memslot_rmap_free(slot);
  
  	for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
- 		kvfree(slot->arch.lpage_info[i - 1]);
+ 		vfree(slot->arch.lpage_info[i - 1]);
  		slot->arch.lpage_info[i - 1] = NULL;
  	}
  	return -ENOMEM;
diff --combined virt/kvm/kvm_main.c
index fb86ec20ebc4,711970d385f5..a1756d5077ee
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@@ -311,7 -311,8 +311,7 @@@ bool kvm_make_vcpus_request_mask(struc
  	return called;
  }
  
 -bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 -				      struct kvm_vcpu *except)
 +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
  {
  	struct kvm_vcpu *vcpu;
  	struct cpumask *cpus;
@@@ -324,14 -325,22 +324,14 @@@
  	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
  	cpumask_clear(cpus);
  
 -	kvm_for_each_vcpu(i, vcpu, kvm) {
 -		if (vcpu == except)
 -			continue;
 +	kvm_for_each_vcpu(i, vcpu, kvm)
  		kvm_make_vcpu_request(vcpu, req, cpus, me);
 -	}
  
  	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
  	put_cpu();
  
  	return called;
  }
 -
 -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 -{
 -	return kvm_make_all_cpus_request_except(kvm, req, NULL);
 -}
  EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
  
  void kvm_flush_remote_tlbs(struct kvm *kvm)
@@@ -392,17 -401,12 +392,17 @@@ static void kvm_flush_shadow_all(struc
  static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
  					       gfp_t gfp_flags)
  {
 +	void *page;
 +
  	gfp_flags |= mc->gfp_zero;
  
  	if (mc->kmem_cache)
  		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
 -	else
 -		return (void *)__get_free_page(gfp_flags);
 +
 +	page = (void *)__get_free_page(gfp_flags);
 +	if (page && mc->init_value)
 +		memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
 +	return page;
  }
  
  int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
@@@ -417,13 -421,6 +417,13 @@@
  		if (WARN_ON_ONCE(!capacity))
  			return -EIO;
  
 +		/*
 +		 * Custom init values can be used only for page allocations,
 +		 * and obviously conflict with __GFP_ZERO.
 +		 */
 +		if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
 +			return -EIO;
 +
  		mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
  		if (!mc->objects)
  			return -ENOMEM;
@@@ -586,6 -583,8 +586,6 @@@ static void kvm_null_fn(void
  }
  #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
  
 -static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
 -
  /* Iterate over each memslot intersecting [start, last] (inclusive) range */
  #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
  	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
@@@ -671,12 -670,14 +671,12 @@@ static __always_inline kvm_mn_ret_t __k
  static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
  						unsigned long start,
  						unsigned long end,
 -						union kvm_mmu_notifier_arg arg,
  						gfn_handler_t handler)
  {
  	struct kvm *kvm = mmu_notifier_to_kvm(mn);
  	const struct kvm_mmu_notifier_range range = {
  		.start		= start,
  		.end		= end,
 -		.arg		= arg,
  		.handler	= handler,
  		.on_lock	= (void *)kvm_null_fn,
  		.flush_on_ret	= true,
@@@ -704,6 -705,48 +704,6 @@@ static __always_inline int kvm_handle_h
  	return __kvm_handle_hva_range(kvm, &range).ret;
  }
  
 -static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 -{
 -	/*
 -	 * Skipping invalid memslots is correct if and only change_pte() is
 -	 * surrounded by invalidate_range_{start,end}(), which is currently
 -	 * guaranteed by the primary MMU.  If that ever changes, KVM needs to
 -	 * unmap the memslot instead of skipping the memslot to ensure that KVM
 -	 * doesn't hold references to the old PFN.
 -	 */
 -	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 -
 -	if (range->slot->flags & KVM_MEMSLOT_INVALID)
 -		return false;
 -
 -	return kvm_set_spte_gfn(kvm, range);
 -}
 -
 -static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 -					struct mm_struct *mm,
 -					unsigned long address,
 -					pte_t pte)
 -{
 -	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 -	const union kvm_mmu_notifier_arg arg = { .pte = pte };
 -
 -	trace_kvm_set_spte_hva(address);
 -
 -	/*
 -	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
 -	 * If mmu_invalidate_in_progress is zero, then no in-progress
 -	 * invalidations, including this one, found a relevant memslot at
 -	 * start(); rechecking memslots here is unnecessary.  Note, a false
 -	 * positive (count elevated by a different invalidation) is sub-optimal
 -	 * but functionally ok.
 -	 */
 -	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 -	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
 -		return;
 -
 -	kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
 -}
 -
  void kvm_mmu_invalidate_begin(struct kvm *kvm)
  {
  	lockdep_assert_held_write(&kvm->mmu_lock);
@@@ -789,7 -832,8 +789,7 @@@ static int kvm_mmu_notifier_invalidate_
  	 * mn_active_invalidate_count (see above) instead of
  	 * mmu_invalidate_in_progress.
  	 */
 -	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
 -					  hva_range.may_block);
 +	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
  
  	/*
  	 * If one or more memslots were found and thus zapped, notify arch code
@@@ -866,7 -910,8 +866,7 @@@ static int kvm_mmu_notifier_clear_flush
  {
  	trace_kvm_age_hva(start, end);
  
 -	return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG,
 -				    kvm_age_gfn);
 +	return kvm_handle_hva_range(mn, start, end, kvm_age_gfn);
  }
  
  static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@@ -919,6 -964,7 +919,6 @@@ static const struct mmu_notifier_ops kv
  	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
  	.clear_young		= kvm_mmu_notifier_clear_young,
  	.test_young		= kvm_mmu_notifier_test_young,
 -	.change_pte		= kvm_mmu_notifier_change_pte,
  	.release		= kvm_mmu_notifier_release,
  };
  
@@@ -974,7 -1020,7 +974,7 @@@ static void kvm_destroy_dirty_bitmap(st
  	if (!memslot->dirty_bitmap)
  		return;
  
- 	kvfree(memslot->dirty_bitmap);
+ 	vfree(memslot->dirty_bitmap);
  	memslot->dirty_bitmap = NULL;
  }
  
@@@ -1283,12 -1329,6 +1283,12 @@@ static void kvm_destroy_devices(struct 
  	 * We do not need to take the kvm->lock here, because nobody else
  	 * has a reference to the struct kvm at this point and therefore
  	 * cannot access the devices list anyhow.
 +	 *
 +	 * The device list is generally managed as an rculist, but list_del()
 +	 * is used intentionally here. If a bug in KVM introduced a reader that
 +	 * was not backed by a reference on the kvm struct, the hope is that
 +	 * it'd consume the poisoned forward pointer instead of suffering a
 +	 * use-after-free, even though this cannot be guaranteed.
  	 */
  	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
  		list_del(&dev->vm_node);
@@@ -2923,7 -2963,7 +2923,7 @@@ out
  /*
   * Pin guest page in memory and return its pfn.
   * @addr: host virtual address which maps memory to the guest
 - * @atomic: whether this function can sleep
 + * @atomic: whether this function is forbidden from sleeping
   * @interruptible: whether the process can be interrupted by non-fatal signals
   * @async: whether this function need to wait IO complete if the
   *         host page is not in the memory
@@@ -2995,12 -3035,16 +2995,12 @@@ kvm_pfn_t __gfn_to_pfn_memslot(const st
  	if (hva)
  		*hva = addr;
  
 -	if (addr == KVM_HVA_ERR_RO_BAD) {
 -		if (writable)
 -			*writable = false;
 -		return KVM_PFN_ERR_RO_FAULT;
 -	}
 -
  	if (kvm_is_error_hva(addr)) {
  		if (writable)
  			*writable = false;
 -		return KVM_PFN_NOSLOT;
 +
 +		return addr == KVM_HVA_ERR_RO_BAD ? KVM_PFN_ERR_RO_FAULT :
 +						    KVM_PFN_NOSLOT;
  	}
  
  	/* Do not map writable pfn in the readonly memslot. */
@@@ -3264,7 -3308,6 +3264,7 @@@ static int next_segment(unsigned long l
  		return len;
  }
  
 +/* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
  static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
  				 void *data, int offset, int len)
  {
@@@ -3366,7 -3409,6 +3366,7 @@@ int kvm_vcpu_read_guest_atomic(struct k
  }
  EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
  
 +/* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
  static int __kvm_write_guest_page(struct kvm *kvm,
  				  struct kvm_memory_slot *memslot, gfn_t gfn,
  			          const void *data, int offset, int len)
@@@ -4683,8 -4725,7 +4683,8 @@@ static int kvm_device_release(struct in
  
  	if (dev->ops->release) {
  		mutex_lock(&kvm->lock);
 -		list_del(&dev->vm_node);
 +		list_del_rcu(&dev->vm_node);
 +		synchronize_rcu();
  		dev->ops->release(dev);
  		mutex_unlock(&kvm->lock);
  	}
@@@ -4767,7 -4808,7 +4767,7 @@@ static int kvm_ioctl_create_device(stru
  		kfree(dev);
  		return ret;
  	}
 -	list_add(&dev->vm_node, &kvm->devices);
 +	list_add_rcu(&dev->vm_node, &kvm->devices);
  	mutex_unlock(&kvm->lock);
  
  	if (ops->init)
@@@ -4778,8 -4819,7 +4778,8 @@@
  	if (ret < 0) {
  		kvm_put_kvm_no_destroy(kvm);
  		mutex_lock(&kvm->lock);
 -		list_del(&dev->vm_node);
 +		list_del_rcu(&dev->vm_node);
 +		synchronize_rcu();
  		if (ops->release)
  			ops->release(dev);
  		mutex_unlock(&kvm->lock);