cpufreq.off=1 [CPU_FREQ]
disable the cpufreq sub-system
+ cpufreq.default_governor=
+ [CPU_FREQ] Name of the default cpufreq governor or
+ policy to use. This governor must be registered in the
+ kernel before the cpufreq driver probes.
+
cpu_init_udelay=N
[X86] Delay for N microsec between assert and de-assert
of APIC INIT to start processors. This delay occurs
useful to also enable the page_owner functionality.
on: enable the feature
+ debugfs= [KNL] This parameter enables what is exposed to userspace
+ and debugfs internal clients.
+ Format: { on, no-mount, off }
+ on: All functions are enabled.
+ no-mount:
+ Filesystem is not registered but kernel clients can
+ access APIs and a crashkernel can be used to read
+ its content. There is nothing to mount.
+ off: Filesystem is not registered and clients
+ get a -EPERM as result when trying to register files
+ or directories within debugfs.
+ This is equivalent of the runtime functionality if
+ debugfs was not enabled in the kernel at all.
+ Default value is set in build-time with a kernel configuration.
+
debugpat [X86] Enable PAT debugging
decnet.addr= [HW,NET]
Format: {"off" | "on" | "skip[mbr]"}
efi= [EFI]
- Format: { "old_map", "nochunk", "noruntime", "debug",
- "nosoftreserve", "disable_early_pci_dma",
- "no_disable_early_pci_dma" }
- old_map [X86-64]: switch to the old ioremap-based EFI
- runtime services mapping. [Needs CONFIG_X86_UV=y]
+ Format: { "debug", "disable_early_pci_dma",
+ "nochunk", "noruntime", "nosoftreserve",
+ "novamap", "no_disable_early_pci_dma",
+ "old_map" }
+ debug: enable misc debug output.
+ disable_early_pci_dma: disable the busmaster bit on all
+ PCI bridges while in the EFI boot stub.
nochunk: disable reading files in "chunks" in the EFI
boot stub, as chunking can cause problems with some
firmware implementations.
noruntime : disable EFI runtime services support
- debug: enable misc debug output
nosoftreserve: The EFI_MEMORY_SP (Specific Purpose)
attribute may cause the kernel to reserve the
memory range for a memory mapping driver to
claim. Specify efi=nosoftreserve to disable this
reservation and treat the memory by its base type
(i.e. EFI_CONVENTIONAL_MEMORY / "System RAM").
- disable_early_pci_dma: Disable the busmaster bit on all
- PCI bridges while in the EFI boot stub
+ novamap: do not call SetVirtualAddressMap().
no_disable_early_pci_dma: Leave the busmaster bit set
on all PCI bridges while in the EFI boot stub
+ old_map [X86-64]: switch to the old ioremap-based EFI
+ runtime services mapping. [Needs CONFIG_X86_UV=y]
efi_no_storage_paranoia [EFI; X86]
Using this parameter you can use more than 50% of
touchscreen support is not enabled in the mainstream
kernel as of 2.6.30, a preliminary port can be found
in the "bleeding edge" mini2440 support kernel at
- http://repo.or.cz/w/linux-2.6/mini2440.git
+ https://repo.or.cz/w/linux-2.6/mini2440.git
mitigations=
[X86,PPC,S390,ARM64] Control optional mitigations for
no5lvl [X86-64] Disable 5-level paging mode. Forces
kernel to use 4-level paging instead.
+ nofsgsbase [X86] Disables FSGSBASE instructions.
+
no_console_suspend
[HW] Never suspend the console
Disable suspending of consoles during suspend and
latencies, which will choose a value aligned
with the appropriate hardware boundaries.
+ rcutree.rcu_min_cached_objs= [KNL]
+ Minimum number of objects which are cached and
+ maintained per one CPU. Object size is equal
+ to PAGE_SIZE. The cache allows to reduce the
+ pressure to page allocator, also it makes the
+ whole algorithm to behave better in low memory
+ condition.
+
rcutree.jiffies_till_first_fqs= [KNL]
Set delay from grace-period initialization to
first attempt to force quiescent states.
Set time (jiffies) between CPU-hotplug operations,
or zero to disable CPU-hotplug testing.
+ rcutorture.read_exit= [KNL]
+ Set the number of read-then-exit kthreads used
+ to test the interaction of RCU updaters and
+ task-exit processing.
+
+ rcutorture.read_exit_burst= [KNL]
+ The number of times in a given read-then-exit
+ episode that a set of read-then-exit kthreads
+ is spawned.
+
+ rcutorture.read_exit_delay= [KNL]
+ The delay, in seconds, between successive
+ read-then-exit testing episodes.
+
rcutorture.shuffle_interval= [KNL]
Set task-shuffle interval (s). Shuffling tasks
allows some CPUs to go into dyntick-idle mode
reboot_cpu is s[mp]#### with #### being the processor
to be used for rebooting.
+ refscale.holdoff= [KNL]
+ Set test-start holdoff period. The purpose of
+ this parameter is to delay the start of the
+ test until boot completes in order to avoid
+ interference.
+
+ refscale.loops= [KNL]
+ Set the number of loops over the synchronization
+ primitive under test. Increasing this number
+ reduces noise due to loop start/end overhead,
+ but the default has already reduced the per-pass
+ noise to a handful of picoseconds on ca. 2020
+ x86 laptops.
+
+ refscale.nreaders= [KNL]
+ Set number of readers. The default value of -1
+ selects N, where N is roughly 75% of the number
+ of CPUs. A value of zero is an interesting choice.
+
+ refscale.nruns= [KNL]
+ Set number of runs, each of which is dumped onto
+ the console log.
+
+ refscale.readdelay= [KNL]
+ Set the read-side critical-section duration,
+ measured in microseconds.
+
+ refscale.scale_type= [KNL]
+ Specify the read-protection implementation to test.
+
+ refscale.shutdown= [KNL]
+ Shut down the system at the end of the performance
+ test. This defaults to 1 (shut it down) when
+ rcuperf is built into the kernel and to 0 (leave
+ it running) when rcuperf is built as a module.
+
+ refscale.verbose= [KNL]
+ Enable additional printk() statements.
+
relax_domain_level=
[KNL, SMP] Set scheduler's default relax_domain_level.
See Documentation/admin-guide/cgroup-v1/cpusets.rst.
Prevent the CPU-hotplug component of torturing
until after init has spawned.
+ torture.ftrace_dump_at_shutdown= [KNL]
+ Dump the ftrace buffer at torture-test shutdown,
+ even if there were no errors. This can be a
+ very costly operation when many torture tests
+ are running concurrently, especially on systems
+ with rotating-rust storage.
+
tp720= [HW,PS2]
tpm_suspend_pcr=[HW,TPM]
panic() code such as dumping handler.
xen_nopvspin [X86,XEN]
- Disables the ticketlock slowpath using Xen PV
- optimizations.
+ Disables the qspinlock slowpath using Xen PV optimizations.
+ This parameter is obsoleted by "nopvspin" parameter, which
+ has equivalent effect for XEN platform.
xen_nopv [X86]
Disables the PV optimizations forcing the HVM guest to
as generic guest with no PV drivers. Currently support
XEN HVM, KVM, HYPER_V and VMWARE guest.
+ nopvspin [X86,XEN,KVM]
+ Disables the qspinlock slow path using PV optimizations
+ which allow the hypervisor to 'idle' the guest on lock
+ contention.
+
xirc2ps_cs= [NET,PCMCIA]
Format:
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
put their references to the VM's file descriptor.
Because a VM's resources are not freed until the last reference to its
-file descriptor is released, creating additional references to a VM via
+file descriptor is released, creating additional references to a VM
via fork(), dup(), etc... without careful consideration is strongly
discouraged and may have unwanted side effects, e.g. memory allocated
by and on behalf of the VM's process may not be freed/unaccounted when
========= ===================================
0 on success,
-EEXIST if an interrupt is already enqueued
- -EINVAL the the irq number is invalid
+ -EINVAL the irq number is invalid
-ENXIO if the PIC is in the kernel
-EFAULT if the pointer is invalid
========= ===================================
Defines the vcpu responses to the cpuid instruction. Applications
should use the KVM_SET_CPUID2 ioctl if available.
+ Note, when this IOCTL fails, KVM gives no guarantees that previous valid CPUID
+ configuration (if there is) is not corrupted. Userspace can get a copy of the
+ resulting CPUID configuration through KVM_GET_CPUID2 in case.
+
::
struct kvm_cpuid_entry {
:Capability: basic
:Architectures: arm, arm64
:Type: vm ioctl
-:Parameters: struct struct kvm_vcpu_init (out)
+:Parameters: struct kvm_vcpu_init (out)
:Returns: 0 on success; -1 on error
Errors:
The information returned by this ioctl can be used to prepare an instance
of struct kvm_vcpu_init for KVM_ARM_VCPU_INIT ioctl which will result in
-in VCPU matching underlying host.
+VCPU matching underlying host.
4.84 KVM_GET_REG_LIST
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
struct kvm_vmx_nested_state_hdr {
- __u32 flags;
__u64 vmxon_pa;
__u64 vmcs12_pa;
- __u64 preemption_timer_deadline;
struct {
__u16 flags;
} smm;
+
+ __u32 flags;
+ __u64 preemption_timer_deadline;
};
struct kvm_vmx_nested_state_data {
/* KVM_EXIT_FAIL_ENTRY */
struct {
__u64 hardware_entry_failure_reason;
+ __u32 cpu; /* if KVM_LAST_CPU */
} fail_entry;
If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due
:Architectures: ppc
This capability, if KVM_CHECK_EXTENSION indicates that it is
-available, means that that the kernel has an implementation of the
+available, means that the kernel has an implementation of the
H_RANDOM hypercall backed by a hardware random-number generator.
If present, the kernel H_RANDOM handler can be enabled for guest use
with the KVM_CAP_PPC_ENABLE_HCALL capability.
:Architectures: x86
This capability, if KVM_CHECK_EXTENSION indicates that it is
-available, means that that the kernel has an implementation of the
+available, means that the kernel has an implementation of the
Hyper-V Synthetic interrupt controller(SynIC). Hyper-V SynIC is
used to support Windows Hyper-V based guest paravirt drivers(VMBus).
:Architectures: ppc
This capability, if KVM_CHECK_EXTENSION indicates that it is
-available, means that that the kernel can support guests using the
+available, means that the kernel can support guests using the
radix MMU defined in Power ISA V3.00 (as implemented in the POWER9
processor).
:Architectures: ppc
This capability, if KVM_CHECK_EXTENSION indicates that it is
-available, means that that the kernel can support guests using the
+available, means that the kernel can support guests using the
hashed page table MMU defined in Power ISA V3.00 (as implemented in
the POWER9 processor), including in-memory segment tables.
If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a
number larger than 0 indicating the version of this capability is implemented
-and thereby which bits in in run->s.regs.device_irq_level can signal values.
+and thereby which bits in run->s.regs.device_irq_level can signal values.
Currently the following bits are defined for the device_irq_level bitmap::
bool return_nisv_io_abort_to_user;
};
- #define KVM_NR_MEM_OBJS 40
-
- /*
- * We don't want allocation failures within the mmu code, so we preallocate
- * enough memory for a single page fault in a cache.
- */
- struct kvm_mmu_memory_cache {
- int nobjs;
- void *objects[KVM_NR_MEM_OBJS];
- };
-
struct kvm_vcpu_fault_info {
u32 esr_el2; /* Hyp Syndrom Register */
u64 far_el2; /* Hyp Fault Address Register */
#define vcpu_has_sve(vcpu) (system_supports_sve() && \
((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE))
-#define vcpu_has_ptrauth(vcpu) ((system_supports_address_auth() || \
- system_supports_generic_auth()) && \
- ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_PTRAUTH))
+#ifdef CONFIG_ARM64_PTR_AUTH
+#define vcpu_has_ptrauth(vcpu) \
+ ((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || \
+ cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && \
+ (vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_PTRAUTH)
+#else
+#define vcpu_has_ptrauth(vcpu) false
+#endif
#define vcpu_gp_regs(v) (&(v)->arch.ctxt.gp_regs)
void force_vm_exit(const cpumask_t *mask);
void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
- int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
- int exception_index);
- void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
- int exception_index);
+ int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
+ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
/* MMIO helpers */
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
- int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
- int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
- phys_addr_t fault_ipa);
+ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu);
+ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa);
int kvm_perf_init(void);
int kvm_perf_teardown(void);
put_page(virt_to_page(pudp));
}
- static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- int min, int max)
- {
- void *page;
-
- BUG_ON(max > KVM_NR_MEM_OBJS);
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < max) {
- page = (void *)__get_free_page(GFP_PGTABLE_USER);
- if (!page)
- return -ENOMEM;
- cache->objects[cache->nobjs++] = page;
- }
- return 0;
- }
-
- static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
- {
- while (mc->nobjs)
- free_page((unsigned long)mc->objects[--mc->nobjs]);
- }
-
- static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
- {
- void *p;
-
- BUG_ON(!mc || !mc->nobjs);
- p = mc->objects[--mc->nobjs];
- return p;
- }
-
static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
{
p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
if (stage2_pgd_none(kvm, *pgd)) {
if (!cache)
return NULL;
- p4d = mmu_memory_cache_alloc(cache);
+ p4d = kvm_mmu_memory_cache_alloc(cache);
stage2_pgd_populate(kvm, pgd, p4d);
get_page(virt_to_page(pgd));
}
if (stage2_p4d_none(kvm, *p4d)) {
if (!cache)
return NULL;
- pud = mmu_memory_cache_alloc(cache);
+ pud = kvm_mmu_memory_cache_alloc(cache);
stage2_p4d_populate(kvm, p4d, pud);
get_page(virt_to_page(p4d));
}
if (stage2_pud_none(kvm, *pud)) {
if (!cache)
return NULL;
- pmd = mmu_memory_cache_alloc(cache);
+ pmd = kvm_mmu_memory_cache_alloc(cache);
stage2_pud_populate(kvm, pud, pmd);
get_page(virt_to_page(pud));
}
return true;
}
-static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr, unsigned long sz)
{
pud_t *pudp;
pmd_t *pmdp;
return false;
if (pudp)
- return kvm_s2pud_exec(pudp);
+ return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
else if (pmdp)
- return kvm_s2pmd_exec(pmdp);
+ return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
else
- return kvm_s2pte_exec(ptep);
+ return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
}
static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
if (stage2_pud_none(kvm, *pud)) {
if (!cache)
return 0; /* ignore calls from kvm_set_spte_hva */
- pmd = mmu_memory_cache_alloc(cache);
+ pmd = kvm_mmu_memory_cache_alloc(cache);
stage2_pud_populate(kvm, pud, pmd);
get_page(virt_to_page(pud));
}
if (pmd_none(*pmd)) {
if (!cache)
return 0; /* ignore calls from kvm_set_spte_hva */
- pte = mmu_memory_cache_alloc(cache);
+ pte = kvm_mmu_memory_cache_alloc(cache);
kvm_pmd_populate(pmd, pte);
get_page(virt_to_page(pmd));
}
phys_addr_t addr, end;
int ret = 0;
unsigned long pfn;
- struct kvm_mmu_memory_cache cache = { 0, };
+ struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
pfn = __phys_to_pfn(pa);
if (writable)
pte = kvm_s2pte_mkwrite(pte);
- ret = mmu_topup_memory_cache(&cache,
- kvm_mmu_cache_min_pages(kvm),
- KVM_NR_MEM_OBJS);
+ ret = kvm_mmu_topup_memory_cache(&cache,
+ kvm_mmu_cache_min_pages(kvm));
if (ret)
goto out;
spin_lock(&kvm->mmu_lock);
}
out:
- mmu_free_memory_cache(&cache);
+ kvm_mmu_free_memory_cache(&cache);
return ret;
}
mmap_read_unlock(current->mm);
/* We need minimum second+third level pages */
- ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
- KVM_NR_MEM_OBJS);
+ ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
if (ret)
return ret;
* execute permissions, and we preserve whatever we have.
*/
needs_exec = exec_fault ||
- (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
+ (fault_status == FSC_PERM &&
+ stage2_is_exec(kvm, fault_ipa, vma_pagesize));
if (vma_pagesize == PUD_SIZE) {
pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
/**
* kvm_handle_guest_abort - handles all 2nd stage aborts
* @vcpu: the VCPU pointer
- * @run: the kvm_run structure
*
* Any abort that gets to the host is almost guaranteed to be caused by a
* missing second stage translation table entry, which can mean that either the
* space. The distinction is based on the IPA causing the fault and whether this
* memory region has been registered as standard RAM by user space.
*/
- int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
unsigned long fault_status;
phys_addr_t fault_ipa;
* of the page size.
*/
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
- ret = io_mem_abort(vcpu, run, fault_ipa);
+ ret = io_mem_abort(vcpu, fault_ipa);
goto out_unlock;
}
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}
phys_addr_t kvm_mmu_get_httbr(void)
/* Macro to expand the AMU counter and type registers*/
#define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), access_amu }
-#define AMU_AMEVTYPE0_EL0(n) { SYS_DESC(SYS_AMEVTYPE0_EL0(n)), access_amu }
+#define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), access_amu }
#define AMU_AMEVCNTR1_EL0(n) { SYS_DESC(SYS_AMEVCNTR1_EL0(n)), access_amu }
-#define AMU_AMEVTYPE1_EL0(n) { SYS_DESC(SYS_AMEVTYPE1_EL0(n)), access_amu }
+#define AMU_AMEVTYPER1_EL0(n) { SYS_DESC(SYS_AMEVTYPER1_EL0(n)), access_amu }
static bool trap_ptrauth(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
AMU_AMEVCNTR0_EL0(13),
AMU_AMEVCNTR0_EL0(14),
AMU_AMEVCNTR0_EL0(15),
- AMU_AMEVTYPE0_EL0(0),
- AMU_AMEVTYPE0_EL0(1),
- AMU_AMEVTYPE0_EL0(2),
- AMU_AMEVTYPE0_EL0(3),
- AMU_AMEVTYPE0_EL0(4),
- AMU_AMEVTYPE0_EL0(5),
- AMU_AMEVTYPE0_EL0(6),
- AMU_AMEVTYPE0_EL0(7),
- AMU_AMEVTYPE0_EL0(8),
- AMU_AMEVTYPE0_EL0(9),
- AMU_AMEVTYPE0_EL0(10),
- AMU_AMEVTYPE0_EL0(11),
- AMU_AMEVTYPE0_EL0(12),
- AMU_AMEVTYPE0_EL0(13),
- AMU_AMEVTYPE0_EL0(14),
- AMU_AMEVTYPE0_EL0(15),
+ AMU_AMEVTYPER0_EL0(0),
+ AMU_AMEVTYPER0_EL0(1),
+ AMU_AMEVTYPER0_EL0(2),
+ AMU_AMEVTYPER0_EL0(3),
+ AMU_AMEVTYPER0_EL0(4),
+ AMU_AMEVTYPER0_EL0(5),
+ AMU_AMEVTYPER0_EL0(6),
+ AMU_AMEVTYPER0_EL0(7),
+ AMU_AMEVTYPER0_EL0(8),
+ AMU_AMEVTYPER0_EL0(9),
+ AMU_AMEVTYPER0_EL0(10),
+ AMU_AMEVTYPER0_EL0(11),
+ AMU_AMEVTYPER0_EL0(12),
+ AMU_AMEVTYPER0_EL0(13),
+ AMU_AMEVTYPER0_EL0(14),
+ AMU_AMEVTYPER0_EL0(15),
AMU_AMEVCNTR1_EL0(0),
AMU_AMEVCNTR1_EL0(1),
AMU_AMEVCNTR1_EL0(2),
AMU_AMEVCNTR1_EL0(13),
AMU_AMEVCNTR1_EL0(14),
AMU_AMEVCNTR1_EL0(15),
- AMU_AMEVTYPE1_EL0(0),
- AMU_AMEVTYPE1_EL0(1),
- AMU_AMEVTYPE1_EL0(2),
- AMU_AMEVTYPE1_EL0(3),
- AMU_AMEVTYPE1_EL0(4),
- AMU_AMEVTYPE1_EL0(5),
- AMU_AMEVTYPE1_EL0(6),
- AMU_AMEVTYPE1_EL0(7),
- AMU_AMEVTYPE1_EL0(8),
- AMU_AMEVTYPE1_EL0(9),
- AMU_AMEVTYPE1_EL0(10),
- AMU_AMEVTYPE1_EL0(11),
- AMU_AMEVTYPE1_EL0(12),
- AMU_AMEVTYPE1_EL0(13),
- AMU_AMEVTYPE1_EL0(14),
- AMU_AMEVTYPE1_EL0(15),
+ AMU_AMEVTYPER1_EL0(0),
+ AMU_AMEVTYPER1_EL0(1),
+ AMU_AMEVTYPER1_EL0(2),
+ AMU_AMEVTYPER1_EL0(3),
+ AMU_AMEVTYPER1_EL0(4),
+ AMU_AMEVTYPER1_EL0(5),
+ AMU_AMEVTYPER1_EL0(6),
+ AMU_AMEVTYPER1_EL0(7),
+ AMU_AMEVTYPER1_EL0(8),
+ AMU_AMEVTYPER1_EL0(9),
+ AMU_AMEVTYPER1_EL0(10),
+ AMU_AMEVTYPER1_EL0(11),
+ AMU_AMEVTYPER1_EL0(12),
+ AMU_AMEVTYPER1_EL0(13),
+ AMU_AMEVTYPER1_EL0(14),
+ AMU_AMEVTYPER1_EL0(15),
{ SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer },
{ SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer },
return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg);
}
- int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu)
{
kvm_inject_undefined(vcpu);
return 1;
return 1;
}
- int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu)
{
const struct sys_reg_desc *target_specific;
size_t num;
target_specific, num);
}
- int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu)
{
const struct sys_reg_desc *target_specific;
size_t num;
target_specific, num);
}
- int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_cp14_64(struct kvm_vcpu *vcpu)
{
return kvm_handle_cp_64(vcpu,
cp14_64_regs, ARRAY_SIZE(cp14_64_regs),
NULL, 0);
}
- int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_cp14_32(struct kvm_vcpu *vcpu)
{
return kvm_handle_cp_32(vcpu,
cp14_regs, ARRAY_SIZE(cp14_regs),
/**
* kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access
* @vcpu: The VCPU pointer
- * @run: The kvm_run struct
*/
- int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
{
struct sys_reg_params params;
unsigned long esr = kvm_vcpu_get_hsr(vcpu);
select HAVE_CBPF_JIT if !64BIT && !CPU_MICROMIPS
select HAVE_CONTEXT_TRACKING
select HAVE_TIF_NOHZ
- select HAVE_COPY_THREAD_TLS
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
select ARC_PROMLIB
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
+ select DMA_OPS
select FW_ARC
select FW_ARC32
select ARCH_MAY_HAVE_PC_FDC
select COMMON_CLK
select USE_OF
select BUILTIN_DTB
+ select PCI_HOST_GENERIC
help
This enables the support of Loongson-2/3 family of machines.
select SYS_SUPPORTS_NUMA
select SYS_SUPPORTS_SMP
select MIPS_L1_CACHE_SHIFT_7
+ select NUMA
help
This are the SGI Origin 200, Origin 2000 and Onyx 2 Graphics
workstations. To compile a Linux kernel that runs on these, say Y
This board is based on Netlogic XLP Processor.
Say Y here if you have a XLP based board.
-config MIPS_PARAVIRT
- bool "Para-Virtualized guest system"
- select CEVT_R4K
- select CSRC_R4K
- select SYS_SUPPORTS_64BIT_KERNEL
- select SYS_SUPPORTS_32BIT_KERNEL
- select SYS_SUPPORTS_BIG_ENDIAN
- select SYS_SUPPORTS_SMP
- select NR_CPUS_DEFAULT_4
- select SYS_HAS_EARLY_PRINTK
- select SYS_HAS_CPU_MIPS32_R2
- select SYS_HAS_CPU_MIPS64_R2
- select SYS_HAS_CPU_CAVIUM_OCTEON
- select HAVE_PCI
- select SWAP_IO_SPACE
- help
- This option supports guest running under ????
-
endchoice
source "arch/mips/alchemy/Kconfig"
source "arch/mips/loongson32/Kconfig"
source "arch/mips/loongson64/Kconfig"
source "arch/mips/netlogic/Kconfig"
-source "arch/mips/paravirt/Kconfig"
endmenu
config SYNC_R4K
bool
-config MIPS_MACHINE
- def_bool n
-
config NO_IOPORT_MAP
def_bool n
config KVM_GUEST
bool "KVM Guest Kernel"
+ depends on CPU_MIPS32_R2
depends on BROKEN_ON_SMP
help
Select this option if building a guest kernel for KVM (Trap & Emulate)
Y to "Enhanced Real Time Clock Support", below.
See also the SMP-HOWTO available at
- <http://www.tldp.org/docs.html#howto>.
+ <https://www.tldp.org/docs.html#howto>.
If you don't know what to do here, say N.
enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
u32 *opc, u32 cause,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
u32 cause,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
int r;
enum emulation_result er;
u32 rt;
+ struct kvm_run *run = vcpu->run;
void *data = run->mmio.data;
unsigned int imme;
unsigned long curr_pc;
vcpu->arch.gprs[rt], *(u32 *)data);
break;
+#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ)
case sdl_op:
run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa(
vcpu->arch.host_cp0_badvaddr) & (~0x7);
vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
vcpu->arch.gprs[rt], *(u64 *)data);
break;
+#endif
#ifdef CONFIG_CPU_LOONGSON64
case sdc2_op:
vcpu->arch.gprs[rt], *(u64 *)data);
break;
default:
- kvm_err("Godson Exteneded GS-Store not yet supported (inst=0x%08x)\n",
+ kvm_err("Godson Extended GS-Store not yet supported (inst=0x%08x)\n",
inst.word);
break;
}
}
enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
- u32 cause, struct kvm_run *run,
- struct kvm_vcpu *vcpu)
+ u32 cause, struct kvm_vcpu *vcpu)
{
+ struct kvm_run *run = vcpu->run;
int r;
enum emulation_result er;
unsigned long curr_pc;
}
break;
+#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ)
case ldl_op:
run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa(
vcpu->arch.host_cp0_badvaddr) & (~0x7);
break;
}
break;
+#endif
#ifdef CONFIG_CPU_LOONGSON64
case ldc2_op:
vcpu->mmio_needed = 30; /* signed */
break;
default:
- kvm_err("Godson Exteneded GS-Load for float not yet supported (inst=0x%08x)\n",
+ kvm_err("Godson Extended GS-Load for float not yet supported (inst=0x%08x)\n",
inst.word);
break;
}
run->mmio.phys_addr, run->mmio.len, run->mmio.data);
if (!r) {
- kvm_mips_complete_mmio_load(vcpu, run);
+ kvm_mips_complete_mmio_load(vcpu);
vcpu->mmio_needed = 0;
return EMULATE_DONE;
}
static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long),
unsigned long curr_pc,
unsigned long addr,
- struct kvm_run *run,
struct kvm_vcpu *vcpu,
u32 cause)
{
/* no matching guest TLB */
vcpu->arch.host_cp0_badvaddr = addr;
vcpu->arch.pc = curr_pc;
- kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, vcpu);
+ kvm_mips_emulate_tlbmiss_ld(cause, NULL, vcpu);
return EMULATE_EXCEPT;
case KVM_MIPS_TLBINV:
/* invalid matching guest TLB */
vcpu->arch.host_cp0_badvaddr = addr;
vcpu->arch.pc = curr_pc;
- kvm_mips_emulate_tlbinv_ld(cause, NULL, run, vcpu);
+ kvm_mips_emulate_tlbinv_ld(cause, NULL, vcpu);
return EMULATE_EXCEPT;
default:
break;
enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
u32 *opc, u32 cause,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DONE;
* guest's behalf.
*/
er = kvm_mips_guest_cache_op(protected_writeback_dcache_line,
- curr_pc, va, run, vcpu, cause);
+ curr_pc, va, vcpu, cause);
if (er != EMULATE_DONE)
goto done;
#ifdef CONFIG_KVM_MIPS_DYN_TRANS
} else if (op_inst == Hit_Invalidate_I) {
/* Perform the icache synchronisation on the guest's behalf */
er = kvm_mips_guest_cache_op(protected_writeback_dcache_line,
- curr_pc, va, run, vcpu, cause);
+ curr_pc, va, vcpu, cause);
if (er != EMULATE_DONE)
goto done;
er = kvm_mips_guest_cache_op(protected_flush_icache_line,
- curr_pc, va, run, vcpu, cause);
+ curr_pc, va, vcpu, cause);
if (er != EMULATE_DONE)
goto done;
}
enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
union mips_instruction inst;
switch (inst.r_format.opcode) {
case cop0_op:
- er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu);
+ er = kvm_mips_emulate_CP0(inst, opc, cause, vcpu);
break;
#ifndef CONFIG_CPU_MIPSR6
case cache_op:
++vcpu->stat.cache_exits;
trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
- er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu);
+ er = kvm_mips_emulate_cache(inst, opc, cause, vcpu);
break;
#else
case spec3_op:
case cache6_op:
++vcpu->stat.cache_exits;
trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
- er = kvm_mips_emulate_cache(inst, opc, cause, run,
+ er = kvm_mips_emulate_cache(inst, opc, cause,
vcpu);
break;
default:
enum emulation_result kvm_mips_emulate_syscall(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
}
enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
* branch target), and pass the RI exception to the guest OS.
*/
vcpu->arch.pc = curr_pc;
- return kvm_mips_emulate_ri_exc(cause, opc, run, vcpu);
+ return kvm_mips_emulate_ri_exc(cause, opc, vcpu);
}
- enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
- struct kvm_run *run)
+ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *run = vcpu->run;
unsigned long *gpr = &vcpu->arch.gprs[vcpu->arch.io_gpr];
enum emulation_result er = EMULATE_DONE;
static enum emulation_result kvm_mips_emulate_exc(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
enum emulation_result kvm_mips_check_privilege(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DONE;
}
if (er == EMULATE_PRIV_FAIL)
- kvm_mips_emulate_exc(cause, opc, run, vcpu);
+ kvm_mips_emulate_exc(cause, opc, vcpu);
return er;
}
*/
enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
u32 *opc,
- struct kvm_run *run,
struct kvm_vcpu *vcpu,
bool write_fault)
{
KVM_ENTRYHI_ASID));
if (index < 0) {
if (exccode == EXCCODE_TLBL) {
- er = kvm_mips_emulate_tlbmiss_ld(cause, opc, run, vcpu);
+ er = kvm_mips_emulate_tlbmiss_ld(cause, opc, vcpu);
} else if (exccode == EXCCODE_TLBS) {
- er = kvm_mips_emulate_tlbmiss_st(cause, opc, run, vcpu);
+ er = kvm_mips_emulate_tlbmiss_st(cause, opc, vcpu);
} else {
kvm_err("%s: invalid exc code: %d\n", __func__,
exccode);
*/
if (!TLB_IS_VALID(*tlb, va)) {
if (exccode == EXCCODE_TLBL) {
- er = kvm_mips_emulate_tlbinv_ld(cause, opc, run,
+ er = kvm_mips_emulate_tlbinv_ld(cause, opc,
vcpu);
} else if (exccode == EXCCODE_TLBS) {
- er = kvm_mips_emulate_tlbinv_st(cause, opc, run,
+ er = kvm_mips_emulate_tlbinv_st(cause, opc,
vcpu);
} else {
kvm_err("%s: invalid exc code: %d\n", __func__,
static inline unsigned int kvm_vz_config6_guest_wrmask(struct kvm_vcpu *vcpu)
{
- return MIPS_CONF6_LOONGSON_INTIMER | MIPS_CONF6_LOONGSON_EXTIMER;
+ return LOONGSON_CONF6_INTIMER | LOONGSON_CONF6_EXTIMER;
}
/*
static inline unsigned int kvm_vz_config6_user_wrmask(struct kvm_vcpu *vcpu)
{
return kvm_vz_config6_guest_wrmask(vcpu) |
- MIPS_CONF6_LOONGSON_SFBEN | MIPS_CONF6_LOONGSON_FTLBDIS;
+ LOONGSON_CONF6_SFBEN | LOONGSON_CONF6_FTLBDIS;
}
static gpa_t kvm_vz_gva_to_gpa_cb(gva_t gva)
static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst,
u32 *opc, u32 cause,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
static enum emulation_result kvm_vz_gpsi_cache(union mips_instruction inst,
u32 *opc, u32 cause,
- struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DONE;
{
enum emulation_result er = EMULATE_DONE;
struct kvm_vcpu_arch *arch = &vcpu->arch;
- struct kvm_run *run = vcpu->run;
union mips_instruction inst;
int rd, rt, sel;
int err;
switch (inst.r_format.opcode) {
case cop0_op:
- er = kvm_vz_gpsi_cop0(inst, opc, cause, run, vcpu);
+ er = kvm_vz_gpsi_cop0(inst, opc, cause, vcpu);
break;
#ifndef CONFIG_CPU_MIPSR6
case cache_op:
trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
- er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu);
+ er = kvm_vz_gpsi_cache(inst, opc, cause, vcpu);
break;
#endif
#ifdef CONFIG_CPU_LOONGSON64
#ifdef CONFIG_CPU_MIPSR6
case cache6_op:
trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
- er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu);
+ er = kvm_vz_gpsi_cache(inst, opc, cause, vcpu);
break;
#endif
case rdhwr_op:
*/
static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu)
{
- struct kvm_run *run = vcpu->run;
u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_FAIL;
int ret = RESUME_GUEST;
break;
case EMULATE_FAIL:
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
break;
*/
static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu)
{
- struct kvm_run *run = vcpu->run;
-
/*
* If MSA not present or not exposed to guest or FR=0, the MSA operation
* should have been treated as a reserved instruction!
(read_gc0_status() & (ST0_CU1 | ST0_FR)) == ST0_CU1 ||
!(read_gc0_config5() & MIPS_CONF5_MSAEN) ||
vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
return RESUME_HOST;
}
}
/* Treat as MMIO */
- er = kvm_mips_emulate_load(inst, cause, run, vcpu);
+ er = kvm_mips_emulate_load(inst, cause, vcpu);
if (er == EMULATE_FAIL) {
kvm_err("Guest Emulate Load from MMIO space failed: PC: %p, BadVaddr: %#lx\n",
opc, badvaddr);
}
/* Treat as MMIO */
- er = kvm_mips_emulate_store(inst, cause, run, vcpu);
+ er = kvm_mips_emulate_store(inst, cause, vcpu);
if (er == EMULATE_FAIL) {
kvm_err("Guest Emulate Store to MMIO space failed: PC: %p, BadVaddr: %#lx\n",
opc, badvaddr);
kvm_vz_flush_shadow_all(kvm);
}
- static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu)
+ static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu)
{
int cpu = smp_processor_id();
int preserve_guest_tlb;
kvm_vz_vcpu_load_wired(vcpu);
}
- static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+ static int kvm_vz_vcpu_run(struct kvm_vcpu *vcpu)
{
int cpu = smp_processor_id();
int r;
kvm_vz_vcpu_load_tlb(vcpu, cpu);
kvm_vz_vcpu_load_wired(vcpu);
- r = vcpu->arch.vcpu_run(run, vcpu);
+ r = vcpu->arch.vcpu_run(vcpu->run, vcpu);
kvm_vz_vcpu_save_wired(vcpu);
{
union diag318_info diag318_info = {
.cpnc = CPNC_LINUX,
- .cpvc_linux = 0,
- .cpvc_distro = {0},
+ .cpvc = 0,
};
if (!sclp.has_diag318)
if (IS_ENABLED(CONFIG_EXPOLINE_AUTO))
nospec_auto_detect();
+ jump_label_init();
parse_early_param();
#ifdef CONFIG_CRASH_DUMP
/* Deactivate elfcorehdr= kernel parameter */
free_mem_detect_info();
remove_oldmem();
- /*
- * Make sure all chunks are MAX_ORDER aligned so we don't need the
- * extra checks that HOLES_IN_ZONE would require.
- *
- * Is this still required?
- */
- memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT));
-
if (is_prot_virt_host())
setup_uv();
setup_memory_end();
select ARCH_HAS_FILTER_PGPROT
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_KCOV if X86_64
+ select ARCH_HAS_KCOV if X86_64 && STACK_VALIDATION
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select GENERIC_CPU_AUTOPROBE
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
+ select GENERIC_ENTRY
select GENERIC_FIND_FIRST_BIT
select GENERIC_IOMAP
select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
select HAVE_CONTEXT_TRACKING if X86_64
- select HAVE_COPY_THREAD_TLS
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_LZO
select HAVE_KERNEL_XZ
+ select HAVE_KERNEL_ZSTD
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
select HAVE_FUNCTION_ERROR_INJECTION
depends on PARAVIRT
select PARAVIRT_CLOCK
select ARCH_CPUIDLE_HALTPOLL
+ select X86_HV_CALLBACK_VECTOR
default y
help
This option enables various optimizations for running under the KVM
config GART_IOMMU
bool "Old AMD GART IOMMU support"
+ select DMA_OPS
select IOMMU_HELPER
select SWIOTLB
depends on X86_64 && PCI && AMD_NB
bool "CPU microcode loading support"
default y
depends on CPU_SUP_AMD || CPU_SUP_INTEL
- select FW_LOADER
help
If you say Y here, you will be able to update the microcode on
Intel and AMD processors. The Intel support is for the IA32 family,
bool "Intel microcode loading support"
depends on MICROCODE
default MICROCODE
- select FW_LOADER
help
This options enables microcode patch loading support for Intel
processors.
config MICROCODE_AMD
bool "AMD microcode loading support"
depends on MICROCODE
- select FW_LOADER
help
If you select this option, microcode patch loading support for AMD
processors will be enabled.
#include <asm/trapnr.h>
#ifndef __ASSEMBLY__
+#include <linux/entry-common.h>
#include <linux/hardirq.h>
#include <asm/irq_stack.h>
-void idtentry_enter_user(struct pt_regs *regs);
-void idtentry_exit_user(struct pt_regs *regs);
-
-bool idtentry_enter_cond_rcu(struct pt_regs *regs);
-void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit);
+bool idtentry_enter_nmi(struct pt_regs *regs);
+void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state);
/**
* DECLARE_IDTENTRY - Declare functions for simple IDT entry points
* The macro is written so it acts as function definition. Append the
* body with a pair of curly brackets.
*
- * idtentry_enter() contains common code which has to be invoked before
- * arbitrary code in the body. idtentry_exit() contains common code
+ * irqentry_enter() contains common code which has to be invoked before
+ * arbitrary code in the body. irqentry_exit() contains common code
* which has to run before returning to the low level assembly code.
*/
#define DEFINE_IDTENTRY(func) \
\
__visible noinstr void func(struct pt_regs *regs) \
{ \
- bool rcu_exit = idtentry_enter_cond_rcu(regs); \
+ irqentry_state_t state = irqentry_enter(regs); \
\
instrumentation_begin(); \
__##func (regs); \
instrumentation_end(); \
- idtentry_exit_cond_rcu(regs, rcu_exit); \
+ irqentry_exit(regs, state); \
} \
\
static __always_inline void __##func(struct pt_regs *regs)
__visible noinstr void func(struct pt_regs *regs, \
unsigned long error_code) \
{ \
- bool rcu_exit = idtentry_enter_cond_rcu(regs); \
+ irqentry_state_t state = irqentry_enter(regs); \
\
instrumentation_begin(); \
__##func (regs, error_code); \
instrumentation_end(); \
- idtentry_exit_cond_rcu(regs, rcu_exit); \
+ irqentry_exit(regs, state); \
} \
\
static __always_inline void __##func(struct pt_regs *regs, \
* body with a pair of curly brackets.
*
* Contrary to DEFINE_IDTENTRY_ERRORCODE() this does not invoke the
- * idtentry_enter/exit() helpers before and after the body invocation. This
+ * irqentry_enter/exit() helpers before and after the body invocation. This
* needs to be done in the body itself if applicable. Use if extra work
* is required before the enter/exit() helpers are invoked.
*/
* to the function as error_code argument which needs to be truncated
* to an u8 because the push is sign extending.
*
- * On 64-bit idtentry_enter/exit() are invoked in the ASM entry code before
- * and after switching to the interrupt stack. On 32-bit this happens in C.
- *
* irq_enter/exit_rcu() are invoked before the function body and the
- * KVM L1D flush request is set.
+ * KVM L1D flush request is set. Stack switching to the interrupt stack
+ * has to be done in the function body if necessary.
*/
#define DEFINE_IDTENTRY_IRQ(func) \
static __always_inline void __##func(struct pt_regs *regs, u8 vector); \
__visible noinstr void func(struct pt_regs *regs, \
unsigned long error_code) \
{ \
- bool rcu_exit = idtentry_enter_cond_rcu(regs); \
+ irqentry_state_t state = irqentry_enter(regs); \
\
instrumentation_begin(); \
irq_enter_rcu(); \
__##func (regs, (u8)error_code); \
irq_exit_rcu(); \
instrumentation_end(); \
- idtentry_exit_cond_rcu(regs, rcu_exit); \
+ irqentry_exit(regs, state); \
} \
\
static __always_inline void __##func(struct pt_regs *regs, u8 vector)
* DEFINE_IDTENTRY_SYSVEC - Emit code for system vector IDT entry points
* @func: Function name of the entry point
*
- * idtentry_enter/exit() and irq_enter/exit_rcu() are invoked before the
+ * irqentry_enter/exit() and irq_enter/exit_rcu() are invoked before the
* function body. KVM L1D flush request is set.
*
* Runs the function on the interrupt stack if the entry hit kernel mode
\
__visible noinstr void func(struct pt_regs *regs) \
{ \
- bool rcu_exit = idtentry_enter_cond_rcu(regs); \
+ irqentry_state_t state = irqentry_enter(regs); \
\
instrumentation_begin(); \
irq_enter_rcu(); \
run_on_irqstack_cond(__##func, regs, regs); \
irq_exit_rcu(); \
instrumentation_end(); \
- idtentry_exit_cond_rcu(regs, rcu_exit); \
+ irqentry_exit(regs, state); \
} \
\
static noinline void __##func(struct pt_regs *regs)
\
__visible noinstr void func(struct pt_regs *regs) \
{ \
- bool rcu_exit = idtentry_enter_cond_rcu(regs); \
+ irqentry_state_t state = irqentry_enter(regs); \
\
instrumentation_begin(); \
__irq_enter_raw(); \
__##func (regs); \
__irq_exit_raw(); \
instrumentation_end(); \
- idtentry_exit_cond_rcu(regs, rcu_exit); \
+ irqentry_exit(regs, state); \
} \
\
static __always_inline void __##func(struct pt_regs *regs)
#else /* CONFIG_X86_64 */
-/* Maps to a regular IDTENTRY on 32bit for now */
-# define DECLARE_IDTENTRY_IST DECLARE_IDTENTRY
-# define DEFINE_IDTENTRY_IST DEFINE_IDTENTRY
-
/**
* DECLARE_IDTENTRY_DF - Declare functions for double fault 32bit variant
* @vector: Vector number (ignored for C)
#endif /* !CONFIG_X86_64 */
/* C-Code mapping */
+#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW
+#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW
+
+#ifdef CONFIG_X86_64
#define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST
#define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST
#define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST
-#define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW
-#define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW
-
#define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST
#define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST
#define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST
-
-/**
- * DECLARE_IDTENTRY_XEN - Declare functions for XEN redirect IDT entry points
- * @vector: Vector number (ignored for C)
- * @func: Function name of the entry point
- *
- * Used for xennmi and xendebug redirections. No DEFINE as this is all ASM
- * indirection magic.
- */
-#define DECLARE_IDTENTRY_XEN(vector, func) \
- asmlinkage void xen_asm_exc_xen##func(void); \
- asmlinkage void asm_exc_xen##func(void)
+#endif
#else /* !__ASSEMBLY__ */
# define DECLARE_IDTENTRY_MCE(vector, func) \
DECLARE_IDTENTRY(vector, func)
-# define DECLARE_IDTENTRY_DEBUG(vector, func) \
- DECLARE_IDTENTRY(vector, func)
-
/* No ASM emitted for DF as this goes through a C shim */
# define DECLARE_IDTENTRY_DF(vector, func)
/* No ASM code emitted for NMI */
#define DECLARE_IDTENTRY_NMI(vector, func)
-/* XEN NMI and DB wrapper */
-#define DECLARE_IDTENTRY_XEN(vector, func) \
- idtentry vector asm_exc_xen##func exc_##func has_error_code=0
-
/*
* ASM code to emit the common vector entry stubs where each stub is
* packed into 8 bytes.
.align 8
SYM_CODE_START(irq_entries_start)
vector=FIRST_EXTERNAL_VECTOR
- pos = .
.rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
UNWIND_HINT_IRET_REGS
+0 :
.byte 0x6a, vector
jmp asm_common_interrupt
nop
/* Ensure that the above is 8 bytes max */
- . = pos + 8
- pos=pos+8
- vector=vector+1
+ . = 0b + 8
+ vector = vector+1
.endr
SYM_CODE_END(irq_entries_start)
.align 8
SYM_CODE_START(spurious_entries_start)
vector=FIRST_SYSTEM_VECTOR
- pos = .
.rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
UNWIND_HINT_IRET_REGS
+0 :
.byte 0x6a, vector
jmp asm_spurious_interrupt
nop
/* Ensure that the above is 8 bytes max */
- . = pos + 8
- pos=pos+8
- vector=vector+1
+ . = 0b + 8
+ vector = vector+1
.endr
SYM_CODE_END(spurious_entries_start)
#endif
DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault);
#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_64
DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
+#else
+DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check);
+#endif
#endif
/* NMI */
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
-DECLARE_IDTENTRY_XEN(X86_TRAP_NMI, nmi);
+#if defined(CONFIG_XEN_PV) && defined(CONFIG_X86_64)
+DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
+#endif
/* #DB */
+#ifdef CONFIG_X86_64
DECLARE_IDTENTRY_DEBUG(X86_TRAP_DB, exc_debug);
-DECLARE_IDTENTRY_XEN(X86_TRAP_DB, debug);
+#else
+DECLARE_IDTENTRY_RAW(X86_TRAP_DB, exc_debug);
+#endif
+#if defined(CONFIG_XEN_PV) && defined(CONFIG_X86_64)
+DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug);
+#endif
/* #DF */
DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
#if IS_ENABLED(CONFIG_HYPERV)
DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
-DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
-DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_STIMER0_VECTOR, sysvec_hyperv_stimer0);
+DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
+DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0);
#endif
#if IS_ENABLED(CONFIG_ACRN_GUEST)
DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback);
#endif
+ #ifdef CONFIG_KVM_GUEST
+ DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
+ #endif
+
#undef X86_TRAP_OTHER
#endif
*/
+ #define pr_fmt(fmt) "kvm-guest: " fmt
+
#include <linux/context_tracking.h>
#include <linux/init.h>
+ #include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
- u32 reason = kvm_read_and_reset_apf_flags();
+ u32 flags = kvm_read_and_reset_apf_flags();
- bool rcu_exit;
+ irqentry_state_t state;
- switch (reason) {
- case KVM_PV_REASON_PAGE_NOT_PRESENT:
- case KVM_PV_REASON_PAGE_READY:
- break;
- default:
+ if (!flags)
return false;
- }
- rcu_exit = idtentry_enter_cond_rcu(regs);
+ state = irqentry_enter(regs);
instrumentation_begin();
/*
if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
panic("Host injected async #PF in interrupt disabled region\n");
- if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
+ if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
if (unlikely(!(user_mode(regs))))
panic("Host injected async #PF in kernel mode\n");
/* Page is swapped out by the host. */
kvm_async_pf_task_wait_schedule(token);
} else {
- kvm_async_pf_task_wake(token);
+ WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
}
instrumentation_end();
- idtentry_exit_cond_rcu(regs, rcu_exit);
+ irqentry_exit(regs, state);
return true;
}
- bool rcu_exit;
+ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
+ {
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ u32 token;
- rcu_exit = idtentry_enter_cond_rcu(regs);
++ irqentry_state_t state;
+
- idtentry_exit_cond_rcu(regs, rcu_exit);
++ state = irqentry_enter(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+
+ if (__this_cpu_read(apf_reason.enabled)) {
+ token = __this_cpu_read(apf_reason.token);
+ kvm_async_pf_task_wake(token);
+ __this_cpu_write(apf_reason.token, 0);
+ wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
+ }
+
++ irqentry_exit(regs, state);
+ set_irq_regs(old_regs);
+ }
+
static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
return;
wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
- pr_info("kvm-stealtime: cpu %d, msr %llx\n",
- cpu, (unsigned long long) slow_virt_to_phys(st));
+ pr_info("stealtime: cpu %d, msr %llx\n", cpu,
+ (unsigned long long) slow_virt_to_phys(st));
}
static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
static void kvm_guest_cpu_init(void)
{
- if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
- u64 pa;
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
+ u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
- pa |= KVM_ASYNC_PF_ENABLED;
+ pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+ wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
+
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
} else {
ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
- WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
+ WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
+ ret);
min = max = apic_id;
ipi_bitmap = 0;
}
if (ipi_bitmap) {
ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
- WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
+ WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
+ ret);
}
local_irq_restore(flags);
{
apic->send_IPI_mask = kvm_send_ipi_mask;
apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
- pr_info("KVM setup pv IPIs\n");
+ pr_info("setup PV IPIs\n");
}
static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
}
}
- static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
- {
- native_smp_prepare_cpus(max_cpus);
- if (kvm_para_has_hint(KVM_HINTS_REALTIME))
- static_branch_disable(&virt_spin_lock_key);
- }
-
static void __init kvm_smp_prepare_boot_cpu(void)
{
/*
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
- if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf)
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
static_branch_enable(&kvm_async_pf_enabled);
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
+ }
#ifdef CONFIG_SMP
- smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
if (pv_sched_yield_supported()) {
smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
- pr_info("KVM setup pv sched yield\n");
+ pr_info("setup PV sched yield\n");
}
if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
kvm_cpu_online, kvm_cpu_down_prepare) < 0)
- pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
+ pr_err("failed to install cpu hotplug callbacks\n");
#else
sev_map_percpu_data();
kvm_guest_cpu_init();
*/
void __init kvm_spinlock_init(void)
{
- /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
- if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+ /*
+ * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
+ * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
+ * preferred over native qspinlock when vCPU is preempted.
+ */
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
+ pr_info("PV spinlocks disabled, no host support\n");
return;
+ }
- if (kvm_para_has_hint(KVM_HINTS_REALTIME))
- return;
+ /*
+ * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
+ * are available.
+ */
+ if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
+ pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
+ goto out;
+ }
- /* Don't use the pvqspinlock code if there is only 1 vCPU. */
- if (num_possible_cpus() == 1)
- return;
+ if (num_possible_cpus() == 1) {
+ pr_info("PV spinlocks disabled, single CPU\n");
+ goto out;
+ }
+
+ if (nopvspin) {
+ pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
+ goto out;
+ }
+
+ pr_info("PV spinlocks enabled\n");
__pv_init_lock_hash();
pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
pv_ops.lock.vcpu_is_preempted =
PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
}
+ /*
+ * When PV spinlock is enabled which is preferred over
+ * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
+ * Just disable it anyway.
+ */
+ out:
+ static_branch_disable(&virt_spin_lock_key);
}
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
void arch_haltpoll_enable(unsigned int cpu)
{
if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
- pr_err_once("kvm: host does not support poll control\n");
- pr_err_once("kvm: host upgrade recommended\n");
+ pr_err_once("host does not support poll control\n");
+ pr_err_once("host upgrade recommended\n");
return;
}
void kvm_apic_set_version(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- struct kvm_cpuid_entry2 *feat;
u32 v = APIC_VERSION;
if (!lapic_in_kernel(vcpu))
* version first and level-triggered interrupts never get EOIed in
* IOAPIC.
*/
- feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
- if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) &&
+ if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) &&
!ioapic_in_kernel(vcpu->kvm))
v |= APIC_LVR_DIRECTED_EOI;
kvm_lapic_set_reg(apic, APIC_LVR, v);
case APIC_TDCR: {
uint32_t old_divisor = apic->divide_count;
- kvm_lapic_set_reg(apic, APIC_TDCR, val);
+ kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb);
update_divide_count(apic);
if (apic->divide_count != old_divisor &&
apic->lapic_timer.period) {
case APIC_SELF_IPI:
if (apic_x2apic_mode(apic)) {
- kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
+ kvm_lapic_reg_write(apic, APIC_ICR,
+ APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
} else
ret = 1;
break;
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
+ if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) ||
apic_lvtt_period(apic))
return;
vcpu->arch.apic_base = value;
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
if (!apic)
return;
#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
+ #include "mmu_internal.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
*/
bool tdp_enabled = false;
- static int max_page_level __read_mostly;
+ static int max_huge_page_level __read_mostly;
+ static int max_tdp_level __read_mostly;
enum {
AUDIT_PRE_PAGE_FAULT,
return likely(kvm_gen == spte_gen);
}
+ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
+ {
+ /* Check if guest physical address doesn't exceed guest maximum */
+ if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) {
+ exception->error_code |= PFERR_RSVD_MASK;
+ return UNMAPPED_GVA;
+ }
+
+ return gpa;
+ }
+
/*
* Sets the shadow PTE masks used by the MMU.
*
static void count_spte_clear(u64 *sptep, u64 spte)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
if (is_shadow_present_pte(spte))
return;
*/
static u64 __get_spte_lockless(u64 *sptep)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
union split_spte spte, *orig = (union split_spte *)sptep;
int count;
local_irq_enable();
}
- static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min)
- {
- void *obj;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
- if (!obj)
- return cache->nobjs >= min ? 0 : -ENOMEM;
- cache->objects[cache->nobjs++] = obj;
- }
- return 0;
- }
-
- static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
- {
- return cache->nobjs;
- }
-
- static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
- struct kmem_cache *cache)
- {
- while (mc->nobjs)
- kmem_cache_free(cache, mc->objects[--mc->nobjs]);
- }
-
- static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
- int min)
- {
- void *page;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
- if (!page)
- return cache->nobjs >= min ? 0 : -ENOMEM;
- cache->objects[cache->nobjs++] = page;
- }
- return 0;
- }
-
- static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
- {
- while (mc->nobjs)
- free_page((unsigned long)mc->objects[--mc->nobjs]);
- }
-
- static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
int r;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
- pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
+ /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
if (r)
- goto out;
- r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+ return r;
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+ PT64_ROOT_MAX_LEVEL);
if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache, 4);
- out:
- return r;
+ return r;
+ if (maybe_indirect) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
+ return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ PT64_ROOT_MAX_LEVEL);
}
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
- pte_list_desc_cache);
- mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache);
- }
-
- static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
- {
- void *p;
-
- BUG_ON(!mc->nobjs);
- p = mc->objects[--mc->nobjs];
- return p;
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
+ return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}
static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu_memory_cache *cache;
+ struct kvm_mmu_memory_cache *mc;
- cache = &vcpu->arch.mmu_pte_list_desc_cache;
- return mmu_memory_cache_free_objects(cache);
+ mc = &vcpu->arch.mmu_pte_list_desc_cache;
+ return kvm_mmu_memory_cache_nr_free_objects(mc);
}
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
struct kvm_mmu_page *sp;
struct kvm_rmap_head *rmap_head;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
return pte_list_add(vcpu, spte, rmap_head);
gfn_t gfn;
struct kvm_rmap_head *rmap_head;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmap_head = gfn_to_rmap(kvm, gfn, sp);
__pte_list_remove(spte, rmap_head);
static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
if (is_large_pte(*sptep)) {
- WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K);
+ WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
drop_spte(kvm, sptep);
--kvm->stat.lpages;
return true;
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
if (__drop_large_spte(vcpu->kvm, sptep)) {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
KVM_PAGES_PER_HPAGE(sp->role.level));
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
- /**
- * kvm_arch_write_log_dirty - emulate dirty page logging
- * @vcpu: Guest mode vcpu
- *
- * Emulate arch specific page modification logging for the
- * nested hypervisor
- */
- int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
- {
- if (kvm_x86_ops.write_log_dirty)
- return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa);
-
- return 0;
- }
-
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn)
{
unsigned long data)
{
u64 *sptep;
- struct rmap_iterator uninitialized_var(iter);
+ struct rmap_iterator iter;
int young = 0;
for_each_rmap_spte(rmap_head, &iter, sptep)
struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
{
struct kvm_mmu_page *sp;
- sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
- sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+ sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
if (!direct)
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+ sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
/*
struct kvm_mmu_page *sp;
unsigned int index;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
index = spte - sp->spt;
if (__test_and_set_bit(index, sp->unsync_child_bitmap))
return;
continue;
}
- child = page_header(ent & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
if (child->unsync_children) {
if (mmu_pages_add(pvec, child, i))
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
-
- #define for_each_valid_sp(_kvm, _sp, _gfn) \
- hlist_for_each_entry(_sp, \
- &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+ #define for_each_valid_sp(_kvm, _sp, _list) \
+ hlist_for_each_entry(_sp, _list, hash_link) \
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
- for_each_valid_sp(_kvm, _sp, _gfn) \
+ for_each_valid_sp(_kvm, _sp, \
+ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
static inline bool is_ept_sp(struct kvm_mmu_page *sp)
static void clear_sp_write_flooding_count(u64 *spte)
{
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- __clear_sp_write_flooding_count(sp);
+ __clear_sp_write_flooding_count(sptep_to_sp(spte));
}
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
int direct,
unsigned int access)
{
+ bool direct_mmu = vcpu->arch.mmu->direct_map;
union kvm_mmu_page_role role;
+ struct hlist_head *sp_list;
unsigned quadrant;
struct kvm_mmu_page *sp;
bool need_sync = false;
if (role.direct)
role.gpte_is_8_bytes = true;
role.access = access;
- if (!vcpu->arch.mmu->direct_map
- && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
+ if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- for_each_valid_sp(vcpu->kvm, sp, gfn) {
+
+ sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
+ for_each_valid_sp(vcpu->kvm, sp, sp_list) {
if (sp->gfn != gfn) {
collisions++;
continue;
if (sp->role.word != role.word)
continue;
+ if (direct_mmu)
+ goto trace_get_page;
+
if (sp->unsync) {
/* The page is good, but __kvm_sync_page might still end
* up zapping it. If so, break in order to rebuild it.
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
__clear_sp_write_flooding_count(sp);
+
+ trace_get_page:
trace_kvm_mmu_get_page(sp, false);
goto out;
}
sp->gfn = gfn;
sp->role = role;
- hlist_add_head(&sp->hash_link,
- &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
+ hlist_add_head(&sp->hash_link, sp_list);
if (!direct) {
/*
* we should do write protection before syncing pages
if (level > PG_LEVEL_4K && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
- clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
* so we should update the spte at this point to get
* a new sp with the correct access.
*/
- child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
if (child->role.access == direct_access)
return;
if (is_large_pte(pte))
--kvm->stat.lpages;
} else {
- child = page_header(pte & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
return true;
if (!sp->root_count) {
/* Count self */
(*nr_zapped)++;
- list_move(&sp->link, invalid_list);
+
+ /*
+ * Already invalid pages (previously active roots) are not on
+ * the active page list. See list_del() in the "else" case of
+ * !sp->root_count.
+ */
+ if (sp->role.invalid)
+ list_add(&sp->link, invalid_list);
+ else
+ list_move(&sp->link, invalid_list);
kvm_mod_used_mmu_pages(kvm, -1);
} else {
- list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ /*
+ * Remove the active root from the active page list, the root
+ * will be explicitly freed when the root_count hits zero.
+ */
+ list_del(&sp->link);
/*
* Obsolete pages cannot be used on any vCPUs, see the comment
}
}
- static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
- struct list_head *invalid_list)
+ static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
+ unsigned long nr_to_zap)
{
- struct kvm_mmu_page *sp;
+ unsigned long total_zapped = 0;
+ struct kvm_mmu_page *sp, *tmp;
+ LIST_HEAD(invalid_list);
+ bool unstable;
+ int nr_zapped;
if (list_empty(&kvm->arch.active_mmu_pages))
- return false;
+ return 0;
+
+ restart:
+ list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+ /*
+ * Don't zap active root pages, the page itself can't be freed
+ * and zapping it will just force vCPUs to realloc and reload.
+ */
+ if (sp->root_count)
+ continue;
+
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
+ &nr_zapped);
+ total_zapped += nr_zapped;
+ if (total_zapped >= nr_to_zap)
+ break;
+
+ if (unstable)
+ goto restart;
+ }
- sp = list_last_entry(&kvm->arch.active_mmu_pages,
- struct kvm_mmu_page, link);
- return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ kvm->stat.mmu_recycled += total_zapped;
+ return total_zapped;
+ }
+
+ static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
+ {
+ if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ return kvm->arch.n_max_mmu_pages -
+ kvm->arch.n_used_mmu_pages;
+
+ return 0;
}
static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
- LIST_HEAD(invalid_list);
+ unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
- if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+ if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
return 0;
- while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
- if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
- break;
-
- ++vcpu->kvm->stat.mmu_recycled;
- }
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
if (!kvm_mmu_available_pages(vcpu->kvm))
return -ENOSPC;
*/
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
- LIST_HEAD(invalid_list);
-
spin_lock(&kvm->mmu_lock);
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
- /* Need to free some mmu pages to achieve the goal. */
- while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
- if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
- break;
+ kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
+ goal_nr_mmu_pages);
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
if (sp_ad_disabled(sp))
spte |= SPTE_AD_DISABLED_MASK;
else if (kvm_vcpu_ad_need_write_protect(vcpu))
struct kvm_mmu_page *child;
u64 pte = *sptep;
- child = page_header(pte & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
{
struct kvm_mmu_page *sp;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
/*
* Without accessed bits, there's no way to distinguish between
if (!slot)
return PG_LEVEL_4K;
- max_level = min(max_level, max_page_level);
+ max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
linfo = lpage_info_slot(gfn, slot, max_level);
if (!linfo->disallow_lpage)
if (!is_shadow_present_pte(spte))
break;
- sp = page_header(__pa(iterator.sptep));
+ sp = sptep_to_sp(iterator.sptep);
if (!is_last_spte(spte, sp->role.level))
break;
if (!VALID_PAGE(*root_hpa))
return;
- sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
+ sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
{
int ret = 0;
- if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+ if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
ret = 1;
}
if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu->root_hpa;
- sp = page_header(root);
+ sp = to_shadow_page(root);
/*
* Even if another CPU was marking the SP as unsync-ed
if (root && VALID_PAGE(root)) {
root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
+ sp = to_shadow_page(root);
mmu_sync_children(vcpu, sp);
}
}
walk_shadow_page_lockless_end(vcpu);
}
- static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- gfn_t gfn)
+ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ gfn_t gfn)
{
struct kvm_arch_async_pf arch;
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
- r = mmu_topup_memory_caches(vcpu);
+ if (fast_page_fault(vcpu, gpa, error_code))
+ return RET_PF_RETRY;
+
+ r = mmu_topup_memory_caches(vcpu, false);
if (r)
return r;
if (lpage_disallowed)
max_level = PG_LEVEL_4K;
- if (fast_page_fault(vcpu, gpa, error_code))
- return RET_PF_RETRY;
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
- if (make_mmu_pages_available(vcpu) < 0)
+ r = make_mmu_pages_available(vcpu);
+ if (r)
goto out_unlock;
r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
prefault, is_tdp && lpage_disallowed);
u64 fault_address, char *insn, int insn_len)
{
int r = 1;
+ u32 flags = vcpu->arch.apf.host_apf_flags;
#ifndef CONFIG_X86_64
/* A 64-bit CR2 should be impossible on 32-bit KVM. */
#endif
vcpu->arch.l1tf_flush_l1d = true;
- switch (vcpu->arch.apf.host_apf_flags) {
- default:
+ if (!flags) {
trace_kvm_page_fault(fault_address, error_code);
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
- break;
- case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
vcpu->arch.apf.host_apf_flags = 0;
local_irq_disable();
kvm_async_pf_task_wait_schedule(fault_address);
local_irq_enable();
- break;
- case KVM_PV_REASON_PAGE_READY:
- vcpu->arch.apf.host_apf_flags = 0;
- local_irq_disable();
- kvm_async_pf_task_wake(fault_address);
- local_irq_enable();
- break;
+ } else {
+ WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
}
+
return r;
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
union kvm_mmu_page_role role)
{
return (role.direct || pgd == root->pgd) &&
- VALID_PAGE(root->hpa) && page_header(root->hpa) &&
- role.word == page_header(root->hpa)->role.word;
+ VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) &&
+ role.word == to_shadow_page(root->hpa)->role.word;
}
/*
*/
if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
mmu->root_level >= PT64_ROOT_4LEVEL)
- return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) &&
- cached_root_available(vcpu, new_pgd, new_role);
+ return cached_root_available(vcpu, new_pgd, new_role);
return false;
}
*/
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa));
+ __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa));
}
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
return role;
}
+ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
+ {
+ /* Use 5-level TDP if and only if it's useful/necessary. */
+ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
+ return 4;
+
+ return max_tdp_level;
+ }
+
static union kvm_mmu_role
kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
{
union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
role.base.ad_disabled = (shadow_accessed_mask == 0);
- role.base.level = vcpu->arch.tdp_level;
+ role.base.level = kvm_mmu_get_tdp_level(vcpu);
role.base.direct = true;
role.base.gpte_is_8_bytes = true;
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
union kvm_mmu_role new_role =
kvm_calc_tdp_mmu_root_page_role(vcpu, false);
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
context->update_pte = nonpaging_update_pte;
- context->shadow_root_level = vcpu->arch.tdp_level;
+ context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
context->direct_map = true;
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
}
static union kvm_mmu_role
- kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only)
{
union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
!is_write_protection(vcpu);
role.base.smap_andnot_wp = role.ext.cr4_smap &&
!is_write_protection(vcpu);
- role.base.direct = !is_paging(vcpu);
role.base.gpte_is_8_bytes = !!is_pae(vcpu);
+ return role;
+ }
+
+ static union kvm_mmu_role
+ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+ {
+ union kvm_mmu_role role =
+ kvm_calc_shadow_root_page_role_common(vcpu, base_only);
+
+ role.base.direct = !is_paging(vcpu);
+
if (!is_long_mode(vcpu))
role.base.level = PT32E_ROOT_LEVEL;
else if (is_la57_mode(vcpu))
return role;
}
- void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ u32 cr0, u32 cr4, u32 efer,
+ union kvm_mmu_role new_role)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
- union kvm_mmu_role new_role =
- kvm_calc_shadow_mmu_root_page_role(vcpu, false);
-
- if (new_role.as_u64 == context->mmu_role.as_u64)
- return;
-
if (!(cr0 & X86_CR0_PG))
nonpaging_init_context(vcpu, context);
else if (efer & EFER_LMA)
context->mmu_role.as_u64 = new_role.as_u64;
reset_shadow_zero_bits_mask(vcpu, context);
}
- EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+ {
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ union kvm_mmu_role new_role =
+ kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+
+ if (new_role.as_u64 != context->mmu_role.as_u64)
+ shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+ }
+
+ static union kvm_mmu_role
+ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
+ {
+ union kvm_mmu_role role =
+ kvm_calc_shadow_root_page_role_common(vcpu, false);
+
+ role.base.direct = false;
+ role.base.level = kvm_mmu_get_tdp_level(vcpu);
+
+ return role;
+ }
+
+ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
+ gpa_t nested_cr3)
+ {
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
+ union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
+
+ context->shadow_root_level = new_role.base.level;
+
+ __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
+
+ if (new_role.as_u64 != context->mmu_role.as_u64)
+ shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+ }
+ EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
static union kvm_mmu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty, gpa_t new_eptp)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
u8 level = vmx_eptp_page_walk_level(new_eptp);
union kvm_mmu_role new_role =
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
kvm_init_shadow_mmu(vcpu,
kvm_read_cr0_bits(vcpu, X86_CR0_PG),
{
int r;
- r = mmu_topup_memory_caches(vcpu);
+ r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
if (r)
goto out;
r = mmu_alloc_roots(vcpu);
* or not since pte prefetch is skiped if it does not have
* enough objects in the cache.
*/
- mmu_topup_memory_caches(vcpu);
+ mmu_topup_memory_caches(vcpu, true);
spin_lock(&vcpu->kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
- void kvm_configure_mmu(bool enable_tdp, int tdp_page_level)
+ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
+ int tdp_huge_page_level)
{
tdp_enabled = enable_tdp;
+ max_tdp_level = tdp_max_root_level;
/*
- * max_page_level reflects the capabilities of KVM's MMU irrespective
+ * max_huge_page_level reflects KVM's MMU capabilities irrespective
* of kernel support, e.g. KVM may be capable of using 1GB pages when
* the kernel is not. But, KVM never creates a page size greater than
* what is used by the kernel for any given HVA, i.e. the kernel's
* capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
*/
if (tdp_enabled)
- max_page_level = tdp_page_level;
+ max_huge_page_level = tdp_huge_page_level;
else if (boot_cpu_has(X86_FEATURE_GBPAGES))
- max_page_level = PG_LEVEL_1G;
+ max_huge_page_level = PG_LEVEL_1G;
else
- max_page_level = PG_LEVEL_2M;
+ max_huge_page_level = PG_LEVEL_2M;
}
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
* SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
* skip allocating the PDP table.
*/
- if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL)
+ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
return 0;
page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
uint i;
int ret;
+ vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
+ vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
+ vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
break;
/*
- * Skip invalid pages with a non-zero root count, zapping pages
- * with a non-zero root count will never succeed, i.e. the page
- * will get thrown back on active_mmu_pages and we'll get stuck
- * in an infinite loop.
+ * Invalid pages should never land back on the list of active
+ * pages. Skip the bogus page, otherwise we'll get stuck in an
+ * infinite loop if the page gets put back on the list (again).
*/
- if (sp->role.invalid && sp->root_count)
+ if (WARN_ON(sp->role.invalid))
continue;
/*
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
pfn = spte_to_pfn(*sptep);
/*
spin_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
- if (sp->role.invalid && sp->root_count)
+ if (WARN_ON(sp->role.invalid))
continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
goto unlock;
}
- if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
- freed++;
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
unlock:
spin_unlock(&kvm->mmu_lock);
!(pte & PT_GUEST_DIRTY_MASK)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
#if PTTYPE == PTTYPE_EPT
- if (kvm_arch_write_log_dirty(vcpu, addr))
+ if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
return -EINVAL;
#endif
pte |= PT_GUEST_DIRTY_MASK;
{
int ret;
pt_element_t pte;
- pt_element_t __user *uninitialized_var(ptep_user);
+ pt_element_t __user *ptep_user;
gfn_t table_gfn;
u64 pt_access, pte_access;
unsigned index, accessed_dirty, pte_pkey;
u64 *spte;
int i;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
if (sp->role.level > PG_LEVEL_4K)
return;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
/*
* If PFEC.RSVD is set, this is a shadow page fault.
* The bit needs to be cleared before walking guest page tables.
return RET_PF_EMULATE;
}
+ r = mmu_topup_memory_caches(vcpu, true);
+ if (r)
+ return r;
+
vcpu->arch.write_fault_to_shadow_pgtable = false;
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
- if (make_mmu_pages_available(vcpu) < 0)
+ r = make_mmu_pages_available(vcpu);
+ if (r)
goto out_unlock;
r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn,
map_writable, prefault, lpage_disallowed);
* No need to check return value here, rmap_can_add() can
* help us to skip pte prefetch later.
*/
- mmu_topup_memory_caches(vcpu);
+ mmu_topup_memory_caches(vcpu, true);
if (!VALID_PAGE(root_hpa)) {
WARN_ON(1);
level = iterator.level;
sptep = iterator.sptep;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
if (is_last_spte(*sptep, level)) {
pt_element_t gpte;
gpa_t pte_gpa;
asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
}
- static int get_npt_level(struct kvm_vcpu *vcpu)
+ static int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
return PT64_ROOT_4LEVEL;
}
svm->vmcb->save.efer = efer | EFER_SVME;
- mark_dirty(svm->vmcb, VMCB_CR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
static int is_external_interrupt(u32 info)
pause_filter_count_max);
if (control->pause_filter_count != old) {
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
trace_kvm_ple_window_update(vcpu->vcpu_id,
control->pause_filter_count, old);
}
pause_filter_count_shrink,
pause_filter_count);
if (control->pause_filter_count != old) {
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
trace_kvm_ple_window_update(vcpu->vcpu_id,
control->pause_filter_count, old);
}
if (npt_enabled && !npt)
npt_enabled = false;
- kvm_configure_mmu(npt_enabled, PG_LEVEL_1G);
+ kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
if (nrips) {
svm_set_cpu_caps();
+ /*
+ * It seems that on AMD processors PTE's accessed bit is
+ * being set by the CPU hardware before the NPF vmexit.
+ * This is not expected behaviour and our tests fail because
+ * of it.
+ * A workaround here is to disable support for
+ * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+ * In this case userspace can know if there is support using
+ * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
+ * it
+ * If future AMD CPU models change the behaviour described above,
+ * this variable can be changed accordingly
+ */
+ allow_smaller_maxphyaddr = !npt_enabled;
+
return 0;
err:
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
- mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
return svm->vmcb->control.tsc_offset;
}
if (enable_vmware_backdoor)
set_exception_intercept(svm, GP_VECTOR);
- set_intercept(svm, INTERCEPT_INTR);
- set_intercept(svm, INTERCEPT_NMI);
- set_intercept(svm, INTERCEPT_SMI);
- set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
- set_intercept(svm, INTERCEPT_RDPMC);
- set_intercept(svm, INTERCEPT_CPUID);
- set_intercept(svm, INTERCEPT_INVD);
- set_intercept(svm, INTERCEPT_INVLPG);
- set_intercept(svm, INTERCEPT_INVLPGA);
- set_intercept(svm, INTERCEPT_IOIO_PROT);
- set_intercept(svm, INTERCEPT_MSR_PROT);
- set_intercept(svm, INTERCEPT_TASK_SWITCH);
- set_intercept(svm, INTERCEPT_SHUTDOWN);
- set_intercept(svm, INTERCEPT_VMRUN);
- set_intercept(svm, INTERCEPT_VMMCALL);
- set_intercept(svm, INTERCEPT_VMLOAD);
- set_intercept(svm, INTERCEPT_VMSAVE);
- set_intercept(svm, INTERCEPT_STGI);
- set_intercept(svm, INTERCEPT_CLGI);
- set_intercept(svm, INTERCEPT_SKINIT);
- set_intercept(svm, INTERCEPT_WBINVD);
- set_intercept(svm, INTERCEPT_XSETBV);
- set_intercept(svm, INTERCEPT_RDPRU);
- set_intercept(svm, INTERCEPT_RSM);
+ svm_set_intercept(svm, INTERCEPT_INTR);
+ svm_set_intercept(svm, INTERCEPT_NMI);
+ svm_set_intercept(svm, INTERCEPT_SMI);
+ svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ svm_set_intercept(svm, INTERCEPT_RDPMC);
+ svm_set_intercept(svm, INTERCEPT_CPUID);
+ svm_set_intercept(svm, INTERCEPT_INVD);
+ svm_set_intercept(svm, INTERCEPT_INVLPG);
+ svm_set_intercept(svm, INTERCEPT_INVLPGA);
+ svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
+ svm_set_intercept(svm, INTERCEPT_MSR_PROT);
+ svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
+ svm_set_intercept(svm, INTERCEPT_VMRUN);
+ svm_set_intercept(svm, INTERCEPT_VMMCALL);
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_CLGI);
+ svm_set_intercept(svm, INTERCEPT_SKINIT);
+ svm_set_intercept(svm, INTERCEPT_WBINVD);
+ svm_set_intercept(svm, INTERCEPT_XSETBV);
+ svm_set_intercept(svm, INTERCEPT_RDPRU);
+ svm_set_intercept(svm, INTERCEPT_RSM);
if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
- set_intercept(svm, INTERCEPT_MONITOR);
- set_intercept(svm, INTERCEPT_MWAIT);
+ svm_set_intercept(svm, INTERCEPT_MONITOR);
+ svm_set_intercept(svm, INTERCEPT_MWAIT);
}
if (!kvm_hlt_in_guest(svm->vcpu.kvm))
- set_intercept(svm, INTERCEPT_HLT);
+ svm_set_intercept(svm, INTERCEPT_HLT);
control->iopm_base_pa = __sme_set(iopm_base);
control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
- clr_intercept(svm, INTERCEPT_INVLPG);
+ svm_clr_intercept(svm, INTERCEPT_INVLPG);
clr_exception_intercept(svm, PF_VECTOR);
clr_cr_intercept(svm, INTERCEPT_CR3_READ);
clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
- if (pause_filter_count) {
+ if (!kvm_pause_in_guest(svm->vcpu.kvm)) {
control->pause_filter_count = pause_filter_count;
if (pause_filter_thresh)
control->pause_filter_thresh = pause_filter_thresh;
- set_intercept(svm, INTERCEPT_PAUSE);
+ svm_set_intercept(svm, INTERCEPT_PAUSE);
} else {
- clr_intercept(svm, INTERCEPT_PAUSE);
+ svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
if (kvm_vcpu_apicv_active(&svm->vcpu))
* in VMCB and clear intercepts to avoid #VMEXIT.
*/
if (vls) {
- clr_intercept(svm, INTERCEPT_VMLOAD);
- clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
}
if (vgif) {
- clr_intercept(svm, INTERCEPT_STGI);
- clr_intercept(svm, INTERCEPT_CLGI);
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ svm_clr_intercept(svm, INTERCEPT_CLGI);
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
clr_exception_intercept(svm, UD_VECTOR);
}
- mark_all_dirty(svm->vmcb);
+ vmcb_mark_all_dirty(svm->vmcb);
enable_gif(svm);
if (unlikely(cpu != vcpu->cpu)) {
svm->asid_generation = 0;
- mark_all_dirty(svm->vmcb);
+ vmcb_mark_all_dirty(svm->vmcb);
}
#ifdef CONFIG_X86_64
/* The following fields are ignored when AVIC is enabled */
WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
- set_intercept(svm, INTERCEPT_VINTR);
+ svm_set_intercept(svm, INTERCEPT_VINTR);
/*
* This is just a dummy VINTR to actually cause a vmexit to happen.
control->int_ctl &= ~V_INTR_PRIO_MASK;
control->int_ctl |= V_IRQ_MASK |
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
- mark_dirty(svm->vmcb, VMCB_INTR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}
static void svm_clear_vintr(struct vcpu_svm *svm)
{
const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK;
- clr_intercept(svm, INTERCEPT_VINTR);
+ svm_clr_intercept(svm, INTERCEPT_VINTR);
/* Drop int_ctl fields related to VINTR injection. */
svm->vmcb->control.int_ctl &= mask;
svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask;
}
- mark_dirty(svm->vmcb, VMCB_INTR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
svm->vmcb->save.idtr.limit = dt->size;
svm->vmcb->save.idtr.base = dt->address ;
- mark_dirty(svm->vmcb, VMCB_DT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
svm->vmcb->save.gdtr.limit = dt->size;
svm->vmcb->save.gdtr.base = dt->address ;
- mark_dirty(svm->vmcb, VMCB_DT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
static void update_cr0_intercept(struct vcpu_svm *svm)
*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
| (gcr0 & SVM_CR0_SELECTIVE_MASK);
- mark_dirty(svm->vmcb, VMCB_CR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
if (gcr0 == *hcr0) {
clr_cr_intercept(svm, INTERCEPT_CR0_READ);
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
- mark_dirty(svm->vmcb, VMCB_CR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
update_cr0_intercept(svm);
}
cr4 |= X86_CR4_PAE;
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
- mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
return 0;
}
/* This is symmetric with svm_get_segment() */
svm->vmcb->save.cpl = (var->dpl & 3);
- mark_dirty(svm->vmcb, VMCB_SEG);
+ vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}
- static void update_bp_intercept(struct kvm_vcpu *vcpu)
+ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
set_exception_intercept(svm, BP_VECTOR);
- } else
- vcpu->guest_debug = 0;
+ }
}
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid_generation = sd->asid_generation;
svm->vmcb->control.asid = sd->next_asid++;
- mark_dirty(svm->vmcb, VMCB_ASID);
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
}
static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
if (unlikely(value != vmcb->save.dr6)) {
vmcb->save.dr6 = value;
- mark_dirty(vmcb, VMCB_DR);
+ vmcb_mark_dirty(vmcb, VMCB_DR);
}
}
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.dr7 = value;
- mark_dirty(svm->vmcb, VMCB_DR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}
static int pf_interception(struct vcpu_svm *svm)
* again while processing KVM_REQ_EVENT if needed.
*/
if (vgif_enabled(svm))
- clr_intercept(svm, INTERCEPT_STGI);
- if (is_intercept(svm, INTERCEPT_VINTR))
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ if (svm_is_intercept(svm, INTERCEPT_VINTR))
svm_clear_vintr(svm);
enable_gif(svm);
static int iret_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.nmi_window_exits;
- clr_intercept(svm, INTERCEPT_IRET);
+ svm_clr_intercept(svm, INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ return 0;
default:
- return 1;
+ return KVM_MSR_RET_INVALID;
}
return 0;
return 1;
vcpu->arch.pat = data;
svm->vmcb->save.g_pat = data;
- mark_dirty(svm->vmcb, VMCB_NPT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
return 1;
- if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
+ if (kvm_spec_ctrl_test_value(data))
return 1;
svm->spec_ctrl = data;
return 1;
svm->vmcb->save.dbgctl = data;
- mark_dirty(svm->vmcb, VMCB_LBR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
if (data & (1ULL<<0))
svm_enable_lbrv(svm);
else
struct kvm_vcpu *vcpu = &svm->vcpu;
bool in_kernel = (svm_get_cpl(vcpu) == 0);
- if (pause_filter_thresh)
+ if (!kvm_pause_in_guest(vcpu->kvm))
grow_ple_window(vcpu);
kvm_vcpu_on_spin(vcpu, in_kernel);
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
= svm->vmcb->control.exit_code;
+ kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
dump_vmcb(vcpu);
return 0;
}
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 1;
+ vcpu->run->internal.ndata = 2;
vcpu->run->internal.data[0] = exit_code;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
return 0;
}
static void reload_tss(struct kvm_vcpu *vcpu)
{
- int cpu = raw_smp_processor_id();
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
sd->tss_desc->type = 9; /* available 32/64-bit TSS */
load_TR_desc();
}
static void pre_svm_run(struct vcpu_svm *svm)
{
- int cpu = raw_smp_processor_id();
-
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu);
if (sev_guest(svm->vcpu.kvm))
- return pre_sev_run(svm, cpu);
+ return pre_sev_run(svm, svm->vcpu.cpu);
/* FIXME: handle wraparound of asid_generation */
if (svm->asid_generation != sd->asid_generation)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- set_intercept(svm, INTERCEPT_IRET);
+ svm_set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm_nested_virtualize_tpr(vcpu))
+ if (nested_svm_virtualize_tpr(vcpu))
return;
clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
if (masked) {
svm->vcpu.arch.hflags |= HF_NMI_MASK;
- set_intercept(svm, INTERCEPT_IRET);
+ svm_set_intercept(svm, INTERCEPT_IRET);
} else {
svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- clr_intercept(svm, INTERCEPT_IRET);
+ svm_clr_intercept(svm, INTERCEPT_IRET);
}
}
if (!gif_set(svm)) {
if (vgif_enabled(svm))
- set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_STGI);
return; /* STGI will cause a vm exit */
}
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm_nested_virtualize_tpr(vcpu))
+ if (nested_svm_virtualize_tpr(vcpu))
return;
if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (svm_nested_virtualize_tpr(vcpu) ||
+ if (nested_svm_virtualize_tpr(vcpu) ||
kvm_vcpu_apicv_active(vcpu))
return;
void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
+ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ struct vcpu_svm *svm)
+ {
+ /*
+ * VMENTER enables interrupts (host state), but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about
+ * it. This is the same logic as for exit_to_user_mode().
+ *
+ * This ensures that e.g. latency analysis on the host observes
+ * guest mode as interrupt enabled.
+ *
+ * guest_enter_irqoff() informs context tracking about the
+ * transition to guest mode and if enabled adjusts RCU state
+ * accordingly.
+ */
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ guest_enter_irqoff();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+
+ __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+
+ #ifdef CONFIG_X86_64
+ native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+ #else
+ loadsegment(fs, svm->host.fs);
+ #ifndef CONFIG_X86_32_LAZY_GS
+ loadsegment(gs, svm->host.gs);
+ #endif
+ #endif
+
+ /*
+ * VMEXIT disables interrupts (host state), but tracing and lockdep
+ * have them in state 'on' as recorded before entering guest mode.
+ * Same as enter_from_user_mode().
+ *
+ * guest_exit_irqoff() restores host context and reinstates RCU if
+ * enabled and required.
+ *
+ * This needs to be done before the below as native_read_msr()
+ * contains a tracepoint and x86_spec_ctrl_restore_host() calls
+ * into world and some more.
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ guest_exit_irqoff();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+ }
+
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
fastpath_t exit_fastpath;
*/
x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
- __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
-
- #ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, svm->host.gs_base);
- #else
- loadsegment(fs, svm->host.fs);
- #ifndef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
- #endif
- #endif
+ svm_vcpu_enter_exit(vcpu, svm);
/*
* We do not use IBRS in the kernel. If this vCPU has used the
SVM_EXIT_EXCP_BASE + MC_VECTOR))
svm_handle_mce(svm);
- mark_all_clean(svm->vmcb);
+ vmcb_mark_all_clean(svm->vmcb);
return exit_fastpath;
}
- static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root)
+ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root,
+ int root_level)
{
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long cr3;
cr3 = __sme_set(root);
if (npt_enabled) {
svm->vmcb->control.nested_cr3 = cr3;
- mark_dirty(svm->vmcb, VMCB_NPT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
/* Loading L2's CR3 is handled by enter_svm_guest_mode. */
if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
}
svm->vmcb->save.cr3 = cr3;
- mark_dirty(svm->vmcb, VMCB_CR);
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
static int is_disabled(void)
return 0;
}
- static void svm_cpuid_update(struct kvm_vcpu *vcpu)
+ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
- if (pause_filter_thresh)
+ if (!kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
}
struct kvm_host_map map;
u64 guest;
u64 vmcb;
+ int ret = 0;
guest = GET_SMSTATE(u64, smstate, 0x7ed8);
vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
return 1;
nested_vmcb = map.hva;
- enter_svm_guest_mode(svm, vmcb, nested_vmcb);
+ ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb);
kvm_vcpu_unmap(&svm->vcpu, &map, true);
}
- return 0;
+
+ return ret;
}
static void enable_smi_window(struct kvm_vcpu *vcpu)
if (!gif_set(svm)) {
if (vgif_enabled(svm))
- set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_STGI);
/* STGI will cause a vm exit */
} else {
/* We must be in SMM; RSM will cause a vmexit anyway. */
static int svm_vm_init(struct kvm *kvm)
{
+ if (!pause_filter_count || !pause_filter_thresh)
+ kvm->arch.pause_in_guest = true;
+
if (avic) {
int ret = avic_vm_init(kvm);
if (ret)
.vcpu_blocking = svm_vcpu_blocking,
.vcpu_unblocking = svm_vcpu_unblocking,
- .update_bp_intercept = update_bp_intercept,
+ .update_exception_bitmap = update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
.set_tss_addr = svm_set_tss_addr,
.set_identity_map_addr = svm_set_identity_map_addr,
- .get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
.get_exit_info = svm_get_exit_info,
- .cpuid_update = svm_cpuid_update,
+ .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
.has_wbinvd_exit = svm_has_wbinvd_exit,
static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
u32 vm_instruction_error)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- /*
- * failValid writes the error number to the current VMCS, which
- * can't be done if there isn't a current VMCS.
- */
- if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
- return nested_vmx_failInvalid(vcpu);
-
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
X86_EFLAGS_SF | X86_EFLAGS_OF))
return kvm_skip_emulated_instruction(vcpu);
}
+ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
+ {
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * failValid writes the error number to the current VMCS, which
+ * can't be done if there isn't a current VMCS.
+ */
+ if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+ return nested_vmx_failInvalid(vcpu);
+
+ return nested_vmx_failValid(vcpu, vm_instruction_error);
+ }
+
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
/* TODO: not to reset guest simply here. */
* consistency checks.
*/
if (enable_ept && nested_early_check)
- vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
+ vmcs_write64(EPT_POINTER,
+ construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
/* All VMFUNCs are currently emulated through L0 vmexits. */
if (cpu_has_vmx_vmfunc())
/*
* Whether page-faults are trapped is determined by a combination of
- * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
- * If enable_ept, L0 doesn't care about page faults and we should
- * set all of these to L1's desires. However, if !enable_ept, L0 does
- * care about (at least some) page faults, and because it is not easy
- * (if at all possible?) to merge L0 and L1's desires, we simply ask
- * to exit on each and every L2 page fault. This is done by setting
- * MASK=MATCH=0 and (see below) EB.PF=1.
+ * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
+ * doesn't care about page faults then we should set all of these to
+ * L1's desires. However, if L0 does care about (some) page faults, it
+ * is not easy (if at all possible?) to merge L0 and L1's desires, we
+ * simply ask to exit on each and every L2 page fault. This is done by
+ * setting MASK=MATCH=0 and (see below) EB.PF=1.
* Note that below we don't need special code to set EB.PF beyond the
* "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
* vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
* !enable_ept, EB.PF is 1, so the "or" will always be 1.
*/
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
- enable_ept ? vmcs12->page_fault_error_code_mask : 0);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
- enable_ept ? vmcs12->page_fault_error_code_match : 0);
+ if (vmx_need_pf_intercept(&vmx->vcpu)) {
+ /*
+ * TODO: if both L0 and L1 need the same MASK and MATCH,
+ * go ahead and use it?
+ */
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+ } else {
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
+ }
if (cpu_has_vmx_apicv()) {
vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
return true;
}
+ static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
+ {
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t dst;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+ return 0;
+
+ if (WARN_ON_ONCE(vmx->nested.pml_full))
+ return 1;
+
+ /*
+ * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
+ * set is already checked as part of A/D emulation.
+ */
+ vmcs12 = get_vmcs12(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ vmx->nested.pml_full = true;
+ return 1;
+ }
+
+ gpa &= ~0xFFFull;
+ dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
+
+ if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+ offset_in_page(dst), sizeof(gpa)))
+ return 0;
+
+ vmcs12->guest_pml_index--;
+
+ return 0;
+ }
+
/*
* Intel's VMX Instruction Reference specifies a common set of prerequisites
* for running VMX instructions (except VMXON, whose prerequisites are
* when using the merged vmcs02.
*/
if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
- return nested_vmx_failValid(vcpu,
- VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
if (vmcs12->launch_state == launch)
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
if (nested_vmx_check_controls(vcpu, vmcs12))
- return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
if (nested_vmx_check_host_state(vcpu, vmcs12))
- return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
/*
* We're finally done with prerequisite checking, and can start with
if (status == NVMX_VMENTRY_VMEXIT)
return 1;
WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
- return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}
/*
* flag and the VM-instruction error field of the VMCS
* accordingly, and skip the emulated instruction.
*/
- (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
/*
* Restore L1's host state to KVM's software model. We're here
}
if (vmx->nested.vmxon)
- return nested_vmx_failValid(vcpu,
- VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+ return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
return r;
if (!page_address_valid(vcpu, vmptr))
- return nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_INVALID_ADDRESS);
+ return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
if (vmptr == vmx->nested.vmxon_ptr)
- return nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_VMXON_POINTER);
+ return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
/*
* When Enlightened VMEntry is enabled on the calling CPU we treat
offset = vmcs_field_to_offset(field);
if (offset < 0)
- return nested_vmx_failValid(vcpu,
- VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
offset = vmcs_field_to_offset(field);
if (offset < 0)
- return nested_vmx_failValid(vcpu,
- VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
/*
* If the vCPU supports "VMWRITE to any supported field in the
*/
if (vmcs_field_readonly(field) &&
!nested_cpu_has_vmwrite_any_field(vcpu))
- return nested_vmx_failValid(vcpu,
- VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+ return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
/*
* Ensure vmcs12 is up-to-date before any VMWRITE that dirties
return r;
if (!page_address_valid(vcpu, vmptr))
- return nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_INVALID_ADDRESS);
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
if (vmptr == vmx->nested.vmxon_ptr)
- return nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_VMXON_POINTER);
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
/* Forbid normal VMPTRLD if Enlightened version was used */
if (vmx->nested.hv_evmcs)
* given physical address won't match the required
* VMCS12_REVISION identifier.
*/
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
(new_vmcs12->hdr.shadow_vmcs &&
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
kvm_vcpu_unmap(vcpu, &map, false);
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
if (type >= 32 || !(types & (1 << type)))
- return nested_vmx_failValid(vcpu,
- VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
/* According to the Intel VMX instruction reference, the memory
* operand is read even if it isn't needed (e.g., for type==global)
switch (type) {
case VMX_EPT_EXTENT_CONTEXT:
if (!nested_vmx_check_eptp(vcpu, operand.eptp))
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
roots_to_free = 0;
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
if (type >= 32 || !(types & (1 << type)))
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
/* according to the intel vmx instruction reference, the memory
return vmx_handle_memory_failure(vcpu, r, &e);
if (operand.vpid >> 16)
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid02 = nested_get_vpid02(vcpu);
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
if (!operand.vpid ||
is_noncanonical_address(operand.gla, vcpu))
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid_sync_vcpu_addr(vpid02, operand.gla);
break;
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
if (!operand.vpid)
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid_sync_context(vpid02);
break;
~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
+ if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
+ return -EINVAL;
+
/*
* SMM temporarily disables VMX, so we cannot be in guest mode,
* nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
if (ret)
return ret;
- /* Empty 'VMXON' state is permitted */
- if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
- return 0;
+ /* Empty 'VMXON' state is permitted if no VMCS loaded */
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
+ /* See vmx_has_valid_vmcs12. */
+ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
+ (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
+ (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
+ return -EINVAL;
+ else
+ return 0;
+ }
if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
goto error_guest_mode;
}
+ vmx->nested.has_preemption_timer_deadline = false;
if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
vmx->nested.has_preemption_timer_deadline = true;
vmx->nested.preemption_timer_deadline =
/*
* secondary cpu-based controls. Do not include those that
- * depend on CPUID bits, they are added later by vmx_cpuid_update.
+ * depend on CPUID bits, they are added later by
+ * vmx_vcpu_after_set_cpuid.
*/
if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
.get_state = vmx_get_nested_state,
.set_state = vmx_set_nested_state,
.get_vmcs12_pages = nested_get_vmcs12_pages,
+ .write_log_dirty = nested_vmx_write_pml_buffer,
.enable_evmcs = nested_enable_evmcs,
.get_evmcs_version = nested_get_evmcs_version,
};
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
+#include <linux/entry-kvm.h>
#include <asm/apic.h>
#include <asm/asm.h>
eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
- if (enable_ept)
+ if (!vmx_need_pf_intercept(vcpu))
eb &= ~(1u << PF_VECTOR);
/* When we are running a nested L2 guest and L1 specified for it a
gs_base = cpu_kernelmode_gs_base(cpu);
if (likely(is_64bit_mm(current->mm))) {
- save_fsgs_for_kvm();
+ current_save_fsgs();
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
msr->data = vmx_get_perf_capabilities();
return 0;
default:
- return 1;
+ return KVM_MSR_RET_INVALID;
}
}
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
- if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
+ if (kvm_spec_ctrl_test_value(data))
return 1;
vmx->spec_ctrl = data;
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
- u64 root_hpa = vcpu->arch.mmu->root_hpa;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 root_hpa = mmu->root_hpa;
/* No flush required if the current context is invalid. */
if (!VALID_PAGE(root_hpa))
return;
if (enable_ept)
- ept_sync_context(construct_eptp(vcpu, root_hpa));
+ ept_sync_context(construct_eptp(vcpu, root_hpa,
+ mmu->shadow_root_level));
else if (!is_guest_mode(vcpu))
vpid_sync_context(to_vmx(vcpu)->vpid);
else
vmx->emulation_required = emulation_required(vcpu);
}
- static int vmx_get_tdp_level(struct kvm_vcpu *vcpu)
+ static int vmx_get_max_tdp_level(void)
{
- if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+ if (cpu_has_vmx_ept_5levels())
return 5;
return 4;
}
- static int get_ept_level(struct kvm_vcpu *vcpu)
- {
- if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
- return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
-
- return vmx_get_tdp_level(vcpu);
- }
-
- u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
+ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
+ int root_level)
{
u64 eptp = VMX_EPTP_MT_WB;
- eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+ eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
if (enable_ept_ad_bits &&
(!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
return eptp;
}
- void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd)
+ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
+ int pgd_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
u64 eptp;
if (enable_ept) {
- eptp = construct_eptp(vcpu, pgd);
+ eptp = construct_eptp(vcpu, pgd, pgd_level);
vmcs_write64(EPT_POINTER, eptp);
if (kvm_x86_ops.tlb_remote_flush) {
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
+
+ /*
+ * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
+ * between guest and host. In that case we only care about present
+ * faults.
+ */
+ if (enable_ept) {
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
+ }
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
!(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
- vcpu->run->internal.ndata = 3;
+ vcpu->run->internal.ndata = 4;
vcpu->run->internal.data[0] = vect_info;
vcpu->run->internal.data[1] = intr_info;
vcpu->run->internal.data[2] = error_code;
+ vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
return 0;
}
if (is_page_fault(intr_info)) {
cr2 = vmx_get_exit_qual(vcpu);
- /* EPT won't cause page fault directly */
- WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept);
- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+ if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+ /*
+ * EPT will cause page fault only if we need to
+ * detect illegal GPAs.
+ */
+ kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+ return 1;
+ } else
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
vcpu->arch.exit_qualification = exit_qualification;
+
+ /*
+ * Check that the GPA doesn't exceed physical memory limits, as that is
+ * a guest page fault. We have to emulate the instruction here, because
+ * if the illegal address is that of a paging structure, then
+ * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
+ * would also use advanced VM-exit information for EPT violations to
+ * reconstruct the page fault error code.
+ */
+ if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+ return kvm_emulate_instruction(vcpu, 0);
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
}
/*
- * Note, return 1 and not 0, vcpu_run() is responsible for
- * morphing the pending signal into the proper return code.
+ * Note, return 1 and not 0, vcpu_run() will invoke
+ * xfer_to_guest_mode() which will create a proper return
+ * code.
*/
- if (signal_pending(current))
+ if (__xfer_to_guest_mode_work_pending())
return 1;
-
- if (need_resched())
- schedule();
}
return 1;
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason;
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
vcpu->run->internal.data[3] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
}
+ vcpu->run->internal.data[vcpu->run->internal.ndata++] =
+ vcpu->arch.last_vmentry_cpu;
return 0;
}
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 1;
+ vcpu->run->internal.ndata = 2;
vcpu->run->internal.data[0] = exit_reason;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
return 0;
}
* information but as all relevant affected CPUs have 32KiB L1D cache size
* there is no point in doing so.
*/
- static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
int size = PAGE_SIZE << L1D_CACHE_ORDER;
vcpu->stat.l1d_flush++;
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
return;
}
}
}
- void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
vmx->loaded_vmcs->host_state.rsp = host_rsp;
bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
+ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ struct vcpu_vmx *vmx)
+ {
+ /*
+ * VMENTER enables interrupts (host state), but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about
+ * it. This is the same logic as for exit_to_user_mode().
+ *
+ * This ensures that e.g. latency analysis on the host observes
+ * guest mode as interrupt enabled.
+ *
+ * guest_enter_irqoff() informs context tracking about the
+ * transition to guest mode and if enabled adjusts RCU state
+ * accordingly.
+ */
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ guest_enter_irqoff();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+ else if (static_branch_unlikely(&mds_user_clear))
+ mds_clear_cpu_buffers();
+
+ if (vcpu->arch.cr2 != native_read_cr2())
+ native_write_cr2(vcpu->arch.cr2);
+
+ vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ vmx->loaded_vmcs->launched);
+
+ vcpu->arch.cr2 = native_read_cr2();
+
+ /*
+ * VMEXIT disables interrupts (host state), but tracing and lockdep
+ * have them in state 'on' as recorded before entering guest mode.
+ * Same as enter_from_user_mode().
+ *
+ * guest_exit_irqoff() restores host context and reinstates RCU if
+ * enabled and required.
+ *
+ * This needs to be done before the below as native_read_msr()
+ * contains a tracepoint and x86_spec_ctrl_restore_host() calls
+ * into world and some more.
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ guest_exit_irqoff();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+ }
+
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
fastpath_t exit_fastpath;
*/
x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
- /* L1D Flush includes CPU buffer clear to mitigate MDS */
- if (static_branch_unlikely(&vmx_l1d_should_flush))
- vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&mds_user_clear))
- mds_clear_cpu_buffers();
-
- if (vcpu->arch.cr2 != read_cr2())
- write_cr2(vcpu->arch.cr2);
-
- vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- vmx->loaded_vmcs->launched);
-
- vcpu->arch.cr2 = read_cr2();
+ /* The actual VMENTER/EXIT is in the .noinstr.text section. */
+ vmx_vcpu_enter_exit(vcpu, vmx);
/*
* We do not use IBRS in the kernel. If this vCPU has used the
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
- static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
+ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
kvm_flush_pml_buffers(kvm);
}
- static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
- {
- struct vmcs12 *vmcs12;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- gpa_t dst;
-
- if (is_guest_mode(vcpu)) {
- WARN_ON_ONCE(vmx->nested.pml_full);
-
- /*
- * Check if PML is enabled for the nested guest.
- * Whether eptp bit 6 is set is already checked
- * as part of A/D emulation.
- */
- vmcs12 = get_vmcs12(vcpu);
- if (!nested_cpu_has_pml(vmcs12))
- return 0;
-
- if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
- vmx->nested.pml_full = true;
- return 1;
- }
-
- gpa &= ~0xFFFull;
- dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
-
- if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
- offset_in_page(dst), sizeof(gpa)))
- return 0;
-
- vmcs12->guest_pml_index--;
- }
-
- return 0;
- }
-
static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *memslot,
gfn_t offset, unsigned long mask)
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .update_bp_intercept = update_exception_bitmap,
+ .update_exception_bitmap = update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.set_tss_addr = vmx_set_tss_addr,
.set_identity_map_addr = vmx_set_identity_map_addr,
- .get_tdp_level = vmx_get_tdp_level,
.get_mt_mask = vmx_get_mt_mask,
.get_exit_info = vmx_get_exit_info,
- .cpuid_update = vmx_cpuid_update,
+ .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
- .write_log_dirty = vmx_write_pml_buffer,
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
ept_lpage_level = PG_LEVEL_2M;
else
ept_lpage_level = PG_LEVEL_4K;
- kvm_configure_mmu(enable_ept, ept_lpage_level);
+ kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
/*
* Only enable PML when hardware supports PML feature, and both EPT
#endif
vmx_check_vmcs12_offsets();
+ /*
+ * Intel processors don't have problems with
+ * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable
+ * it for VMX by default
+ */
+ allow_smaller_maxphyaddr = true;
+
return 0;
}
module_init(vmx_init);
#include <linux/sched/stat.h>
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
+#include <linux/entry-kvm.h>
#include <trace/events/kvm.h>
u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);
+ bool __read_mostly allow_smaller_maxphyaddr;
+ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
static u64 __read_mostly host_xss;
u64 __read_mostly supported_xss;
EXPORT_SYMBOL_GPL(supported_xss);
static struct kmem_cache *x86_emulator_cache;
+ /*
+ * When called, it means the previous get/set msr reached an invalid msr.
+ * Return 0 if we want to ignore/silent this failed msr access, or 1 if we want
+ * to fail the caller.
+ */
+ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+ u64 data, bool write)
+ {
+ const char *op = write ? "wrmsr" : "rdmsr";
+
+ if (ignore_msrs) {
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ /* Mask the error */
+ return 0;
+ } else {
+ vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ return 1;
+ }
+ }
+
static struct kmem_cache *kvm_alloc_emulator_cache(void)
{
unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
- asmlinkage __visible void kvm_spurious_fault(void)
+ asmlinkage __visible noinstr void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
BUG_ON(!kvm_rebooting);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
return 1;
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+ if (cr0 & X86_CR0_PG) {
#ifdef CONFIG_X86_64
- if ((vcpu->arch.efer & EFER_LME)) {
+ if (!is_paging(vcpu) && (vcpu->arch.efer & EFER_LME)) {
int cs_db, cs_l;
if (!is_pae(vcpu))
return 1;
} else
#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
+ if (is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
return 1;
}
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
return 0;
}
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
- #define __cr4_reserved_bits(__cpu_has, __c) \
- ({ \
- u64 __reserved_bits = CR4_RESERVED_BITS; \
- \
- if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \
- __reserved_bits |= X86_CR4_OSXSAVE; \
- if (!__cpu_has(__c, X86_FEATURE_SMEP)) \
- __reserved_bits |= X86_CR4_SMEP; \
- if (!__cpu_has(__c, X86_FEATURE_SMAP)) \
- __reserved_bits |= X86_CR4_SMAP; \
- if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \
- __reserved_bits |= X86_CR4_FSGSBASE; \
- if (!__cpu_has(__c, X86_FEATURE_PKU)) \
- __reserved_bits |= X86_CR4_PKE; \
- if (!__cpu_has(__c, X86_FEATURE_LA57)) \
- __reserved_bits |= X86_CR4_LA57; \
- if (!__cpu_has(__c, X86_FEATURE_UMIP)) \
- __reserved_bits |= X86_CR4_UMIP; \
- __reserved_bits; \
- })
-
- static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+ int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & cr4_reserved_bits)
return -EINVAL;
- if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu))
+ if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
return -EINVAL;
return 0;
}
+ EXPORT_SYMBOL_GPL(kvm_valid_cr4);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
kvm_mmu_reset_context(vcpu);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
return 0;
}
case 4:
/* fall through */
case 6:
- if (val & 0xffffffff00000000ULL)
+ if (!kvm_dr6_valid(val))
return -1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
break;
rdmsrl_safe(msr->index, &msr->data);
break;
default:
- if (kvm_x86_ops.get_msr_feature(msr))
- return 1;
+ return kvm_x86_ops.get_msr_feature(msr);
}
return 0;
}
msr.index = index;
r = kvm_get_msr_feature(&msr);
+
+ if (r == KVM_MSR_RET_INVALID) {
+ /* Unconditionally clear the output for simplicity */
+ *data = 0;
+ r = kvm_msr_ignored_check(vcpu, index, 0, false);
+ }
+
if (r)
return r;
return kvm_x86_ops.set_msr(vcpu, &msr);
}
+ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 data, bool host_initiated)
+ {
+ int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
+
+ if (ret == KVM_MSR_RET_INVALID)
+ ret = kvm_msr_ignored_check(vcpu, index, data, true);
+
+ return ret;
+ }
+
/*
* Read the MSR specified by @index into @data. Select MSR specific fault
* checks are bypassed if @host_initiated is %true.
return ret;
}
+ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 *data, bool host_initiated)
+ {
+ int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
+
+ if (ret == KVM_MSR_RET_INVALID) {
+ /* Unconditionally clear *data for simplicity */
+ *data = 0;
+ ret = kvm_msr_ignored_check(vcpu, index, 0, false);
+ }
+
+ return ret;
+ }
+
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
- return __kvm_get_msr(vcpu, index, data, false);
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_get_msr);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
- return __kvm_set_msr(vcpu, index, data, false);
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
- need_resched() || signal_pending(current);
+ xfer_to_guest_mode_work_pending();
}
EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
*/
static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- return __kvm_get_msr(vcpu, index, data, true);
+ return kvm_get_msr_ignored_check(vcpu, index, data, true);
}
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- return __kvm_set_msr(vcpu, index, *data, true);
+ return kvm_set_msr_ignored_check(vcpu, index, *data, true);
}
#ifdef CONFIG_X86_64
return 1;
vcpu->arch.arch_capabilities = data;
break;
+ case MSR_IA32_PERF_CAPABILITIES: {
+ struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
+
+ if (!msr_info->host_initiated)
+ return 1;
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+ return 1;
+ if (data & ~msr_ent.data)
+ return 1;
+
+ vcpu->arch.perf_capabilities = data;
+
+ return 0;
+ }
case MSR_EFER:
return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
return 1;
vcpu->arch.ia32_misc_enable_msr = data;
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
} else {
vcpu->arch.ia32_misc_enable_msr = data;
}
return xen_hvm_config(vcpu, data);
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- if (!ignore_msrs) {
- vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
- return 1;
- } else {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu,
- "ignored wrmsr: 0x%x data 0x%llx\n",
- msr, data);
- break;
- }
+ return KVM_MSR_RET_INVALID;
}
return 0;
}
return 1;
msr_info->data = vcpu->arch.arch_capabilities;
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return 1;
+ msr_info->data = vcpu->arch.perf_capabilities;
+ break;
case MSR_IA32_POWER_CTL:
msr_info->data = vcpu->arch.msr_ia32_power_ctl;
break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
- if (!ignore_msrs) {
- vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
- msr_info->index);
- return 1;
- } else {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
- msr_info->index);
- msr_info->data = 0;
- }
- break;
+ return KVM_MSR_RET_INVALID;
}
return 0;
}
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
case KVM_CAP_SET_GUEST_DEBUG:
+ case KVM_CAP_LAST_CPU:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
break;
+ case KVM_CAP_SMALLER_MAXPHYADDR:
+ r = (int) allow_smaller_maxphyaddr;
+ break;
default:
break;
}
kvm_x86_ops.set_efer(vcpu, 0);
#endif
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
kvm_mmu_reset_context(vcpu);
}
}
trace_kvm_entry(vcpu->vcpu_id);
- guest_enter_irqoff();
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
if (hw_breakpoint_active())
hw_breakpoint_restore();
+ vcpu->arch.last_vmentry_cpu = vcpu->cpu;
vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
local_irq_disable();
kvm_after_interrupt(vcpu);
- guest_exit_irqoff();
if (lapic_in_kernel(vcpu)) {
s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
if (delta != S64_MIN) {
break;
}
- if (signal_pending(current)) {
- r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- break;
- }
- if (need_resched()) {
+ if (__xfer_to_guest_mode_work_pending()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- cond_resched();
+ r = xfer_to_guest_mode_handle_work(vcpu);
+ if (r)
+ return r;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
(X86_CR4_OSXSAVE | X86_CR4_PKE));
kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
if (cpuid_update_needed)
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops.update_bp_intercept(vcpu);
+ kvm_x86_ops.update_exception_bitmap(vcpu);
r = 0;
fx_init(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
{
int i, r;
- unsigned long hva, uninitialized_var(old_npages);
+ unsigned long hva, old_npages;
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *slot;
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
- u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
+
+ int kvm_spec_ctrl_test_value(u64 value)
{
- uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;
+ /*
+ * test that setting IA32_SPEC_CTRL to given value
+ * is allowed by the host processor
+ */
- /* The STIBP bit doesn't fault even if it's not advertised */
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
- bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
- if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
- !boot_cpu_has(X86_FEATURE_AMD_IBRS))
- bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
+ u64 saved_value;
+ unsigned long flags;
+ int ret = 0;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
- bits &= ~SPEC_CTRL_SSBD;
- if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
- !boot_cpu_has(X86_FEATURE_AMD_SSBD))
- bits &= ~SPEC_CTRL_SSBD;
+ local_irq_save(flags);
+
+ if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
+ ret = 1;
+ else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
+ ret = 1;
+ else
+ wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
+
+ local_irq_restore(flags);
- return bits;
+ return ret;
+ }
+ EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
+
+ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
+ {
+ struct x86_exception fault;
+
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, &fault) != UNMAPPED_GVA) {
+ /*
+ * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+ * tables probably do not match the TLB. Just proceed
+ * with the error code that the processor gave.
+ */
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = error_code;
+ fault.nested_page_fault = false;
+ fault.address = gva;
+ }
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
}
- EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
+ EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
- int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- unsigned long hva, struct kvm_arch_async_pf *arch);
+ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ unsigned long hva, struct kvm_arch_async_pf *arch);
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#endif
* Array indexed by gsi. Each entry contains list of irq chips
* the gsi is connected to.
*/
- struct hlist_head map[0];
+ struct hlist_head map[];
};
#endif
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
+ bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
void kvm_flush_remote_tlbs(struct kvm *kvm);
void kvm_reload_remote_mmus(struct kvm *kvm);
+ #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
+ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
+ int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
+ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
+ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
+ #endif
+
bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
struct kvm_vcpu *except,
unsigned long *vcpu_bitmap, cpumask_var_t tmp);
uintptr_t data, const char *name,
struct task_struct **thread_ptr);
+#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
+static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ vcpu->stat.signal_exits++;
+}
+#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
+
#endif