#ifndef __ASM_S390_PROCESSOR_H
#define __ASM_S390_PROCESSOR_H
+#ifndef __ASSEMBLY__
+
#include <linux/linkage.h>
#include <linux/irqflags.h>
#include <asm/cpu.h>
#include <asm/page.h>
#include <asm/ptrace.h>
#include <asm/setup.h>
+#include <asm/runtime_instr.h>
/*
* Default implementation of macro that returns current
unsigned long gmap_addr; /* address of last gmap fault. */
struct per_regs per_user; /* User specified PER registers */
struct per_event per_event; /* Cause of the last PER trap */
+ unsigned long per_flags; /* Flags to control debug behavior */
/* pfault_wait is used to block the process on a pfault event */
unsigned long pfault_wait;
struct list_head list;
+ /* cpu runtime instrumentation */
+ struct runtime_instr_cb *ri_cb;
+ int ri_signum;
+#ifdef CONFIG_64BIT
+ unsigned char trap_tdb[256]; /* Transaction abort diagnose block */
+#endif
};
+#define PER_FLAG_NO_TE 1UL /* Flag to disable transactions. */
+
typedef struct thread_struct thread_struct;
/*
struct mm_struct;
struct seq_file;
+#ifdef CONFIG_64BIT
+extern void show_cacheinfo(struct seq_file *m);
+#else
+static inline void show_cacheinfo(struct seq_file *m) { }
+#endif
+
/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);
extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
extern unsigned long thread_saved_pc(struct task_struct *t);
extern void show_code(struct pt_regs *regs);
+extern void print_fn_code(unsigned char *code, unsigned long len);
+ extern int insn_to_mnemonic(unsigned char *instruction, char buf[8]);
unsigned long get_wchan(struct task_struct *p);
#define task_pt_regs(tsk) ((struct pt_regs *) \
#define ARCH_LOW_ADDRESS_LIMIT 0x7fffffffUL
-/*
- * Helper macro for exception table entries
- */
-#ifndef CONFIG_64BIT
-#define EX_TABLE(_fault,_target) \
- ".section __ex_table,\"a\"\n" \
- " .align 4\n" \
- " .long " #_fault "," #_target "\n" \
- ".previous\n"
-#else
-#define EX_TABLE(_fault,_target) \
- ".section __ex_table,\"a\"\n" \
- " .align 8\n" \
- " .quad " #_fault "," #_target "\n" \
- ".previous\n"
-#endif
-
extern int memcpy_real(void *, void *, size_t);
extern void memcpy_absolute(void *, void *, size_t);
memcpy_absolute(&(dest), &__tmp, sizeof(__tmp)); \
}
-#endif /* __ASM_S390_PROCESSOR_H */
+/*
+ * Helper macro for exception table entries
+ *
+ * This variant expands to adjacent string literals holding assembler
+ * directives. The directives switch to the __ex_table section, apply
+ * ".align 4" and emit two .long values, one for _fault and one for
+ * _target. Each value is written as "(expr) - .", i.e. as an offset
+ * from the location of the field itself rather than as an absolute
+ * address. ".previous" then switches back to the section that was
+ * active before.
+ */
+#define EX_TABLE(_fault, _target) \
+ ".section __ex_table,\"a\"\n" \
+ ".align 4\n" \
+ ".long (" #_fault ") - .\n" \
+ ".long (" #_target ") - .\n" \
+ ".previous\n"
+
+#else /* __ASSEMBLY__ */
+
+#define EX_TABLE(_fault, _target) \
+ .section __ex_table,"a" ; \
+ .align 4 ; \
+ .long (_fault) - . ; \
+ .long (_target) - . ; \
+ .previous
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_S390_PROCESSOR_H */
LONG_INSN_POPCNT,
LONG_INSN_RISBHG,
LONG_INSN_RISBLG,
+ LONG_INSN_RINEXT,
+ LONG_INSN_RIEMIT,
+ LONG_INSN_TABORT,
+ LONG_INSN_TBEGIN,
+ LONG_INSN_TBEGINC,
};
static char *long_insn_name[] = {
[LONG_INSN_LLGHRL] = "llghrl",
[LONG_INSN_POPCNT] = "popcnt",
[LONG_INSN_RISBHG] = "risbhg",
- [LONG_INSN_RISBLG] = "risblk",
+ [LONG_INSN_RISBLG] = "risblg",
+ [LONG_INSN_RINEXT] = "rinext",
+ [LONG_INSN_RIEMIT] = "riemit",
+ [LONG_INSN_TABORT] = "tabort",
+ [LONG_INSN_TBEGIN] = "tbegin",
+ [LONG_INSN_TBEGINC] = "tbeginc",
};
static struct insn opcode[] = {
{ "", 0, INSTR_INVALID }
};
+static struct insn opcode_aa[] = {	/* table selected by the "case 0xaa" dispatch */
+#ifdef CONFIG_64BIT
+ { { 0, LONG_INSN_RINEXT }, 0x00, INSTR_RI_RI },	/* name[0] == 0: name[1] indexes long_insn_name[] ("rinext") */
+ { "rion", 0x01, INSTR_RI_RI },
+ { "tric", 0x02, INSTR_RI_RI },
+ { "rioff", 0x03, INSTR_RI_RI },
+ { { 0, LONG_INSN_RIEMIT }, 0x04, INSTR_RI_RI },	/* long name via long_insn_name[] ("riemit") */
+#endif
+ { "", 0, INSTR_INVALID }	/* NOTE(review): looks like the end-of-table sentinel - confirm */
+};
+
static struct insn opcode_b2[] = {
#ifdef CONFIG_64BIT
{ "sske", 0x2b, INSTR_RRF_M0RR },
{ "lpswe", 0xb2, INSTR_S_RD },
{ "srnmt", 0xb9, INSTR_S_RD },
{ "lfas", 0xbd, INSTR_S_RD },
+ { "etndg", 0xec, INSTR_RRE_R0 },
+ { { 0, LONG_INSN_TABORT }, 0xfc, INSTR_S_RD },
+ { "tend", 0xf8, INSTR_S_RD },
#endif
{ "stidp", 0x02, INSTR_S_RD },
{ "sck", 0x04, INSTR_S_RD },
{ "stfh", 0xcb, INSTR_RXY_RRRD },
{ "chf", 0xcd, INSTR_RXY_RRRD },
{ "clhf", 0xcf, INSTR_RXY_RRRD },
+ { "ntstg", 0x25, INSTR_RXY_RRRD },
#endif
{ "lrv", 0x1e, INSTR_RXY_RRRD },
{ "lrvh", 0x1f, INSTR_RXY_RRRD },
{ "mvhhi", 0x44, INSTR_SIL_RDI },
{ "mvhi", 0x4c, INSTR_SIL_RDI },
{ "mvghi", 0x48, INSTR_SIL_RDI },
+ { { 0, LONG_INSN_TBEGIN }, 0x60, INSTR_SIL_RDU },
+ { { 0, LONG_INSN_TBEGINC }, 0x61, INSTR_SIL_RDU },
#endif
{ "lasp", 0x00, INSTR_SSE_RDRD },
{ "tprot", 0x01, INSTR_SSE_RDRD },
{ "cliy", 0x55, INSTR_SIY_URD },
{ "oiy", 0x56, INSTR_SIY_URD },
{ "xiy", 0x57, INSTR_SIY_URD },
+ { "lric", 0x60, INSTR_RSY_RDRM },
+ { "stric", 0x61, INSTR_RSY_RDRM },
+ { "mric", 0x62, INSTR_RSY_RDRM },
{ "icmh", 0x80, INSTR_RSE_RURD },
{ "icmh", 0x80, INSTR_RSY_RURD },
{ "icmy", 0x81, INSTR_RSY_RURD },
case 0xa7:
table = opcode_a7;
break;
+ case 0xaa:
+ table = opcode_aa;
+ break;
case 0xb2:
table = opcode_b2;
break;
return NULL;
}
+ /**
+  * insn_to_mnemonic - decode an s390 instruction
+  * @instruction: instruction to decode
+  * @buf: buffer to fill with the mnemonic; must hold at least 8 bytes
+  *
+  * Look up the instruction at @instruction with find_insn() and store the
+  * corresponding NUL-terminated mnemonic into @buf. Table entries whose
+  * name starts with a NUL byte keep an index into long_insn_name[] in
+  * their second name byte; all other names are copied directly (at most
+  * five characters).
+  * @buf is left unchanged if the instruction could not be decoded.
+  *
+  * Returns:
+  * %0 on success, %-ENOENT if the instruction was not found.
+  */
+ int insn_to_mnemonic(unsigned char *instruction, char buf[8])
+ {
+ 	/*
+ 	 * An array parameter decays to a pointer, so sizeof(buf) would be
+ 	 * the pointer size (only 4 bytes on a non-64-bit build) instead of
+ 	 * the 8 bytes the prototype promises. Use the documented length
+ 	 * explicitly so long mnemonics such as "tbeginc" are not truncated.
+ 	 */
+ 	const unsigned int buflen = 8;
+ 	struct insn *insn;
+
+ 	insn = find_insn(instruction);
+ 	if (!insn)
+ 		return -ENOENT;
+ 	if (insn->name[0] == '\0')
+ 		snprintf(buf, buflen, "%s",
+ 			 long_insn_name[(int) insn->name[1]]);
+ 	else
+ 		snprintf(buf, buflen, "%.5s", insn->name);
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(insn_to_mnemonic);
+
static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
{
struct insn *insn;
}
printk("\n");
}
+
+/*
+ * print_fn_code - print a disassembly of a range of code
+ * @code: first instruction byte
+ * @len: number of bytes to process
+ *
+ * For every instruction in the range, one line is emitted via printk():
+ * the instruction address, its raw bytes in hex and the text produced by
+ * print_insn(). The instruction size is taken from insn_length() of the
+ * first byte. Processing stops once @len bytes have been consumed, or
+ * early if the next instruction would extend past the remaining bytes.
+ */
+void print_fn_code(unsigned char *code, unsigned long len)
+{
+	char buffer[64], *ptr;
+	int opsize, i;
+
+	while (len) {
+		ptr = buffer;
+		opsize = insn_length(*code);
+		/*
+		 * Never decode beyond the caller's range. Without this check
+		 * the unsigned subtraction below would wrap and the loop
+		 * would keep reading far past @code + @len.
+		 */
+		if ((unsigned long) opsize > len)
+			break;
+		ptr += sprintf(ptr, "%p: ", code);
+		for (i = 0; i < opsize; i++)
+			ptr += sprintf(ptr, "%02x", code[i]);
+		*ptr++ = '\t';
+		/* Short instructions get a second tab to keep columns aligned. */
+		if (i < 4)
+			*ptr++ = '\t';
+		ptr += print_insn(ptr, code, (unsigned long) code);
+		*ptr++ = '\n';
+		*ptr++ = 0;
+		/* The buffer is data, not a format string. */
+		printk("%s", buffer);
+		code += opsize;
+		len -= opsize;
+	}
+}
menuconfig VIRTUALIZATION
def_bool y
- prompt "Virtualization"
+ prompt "KVM"
---help---
Say Y here to get to see options for using your Linux host to run other
operating systems inside virtual machines (guests).
depends on HAVE_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
select ANON_INODES
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
#include <asm/sysinfo.h>
#include "gaccess.h"
#include "kvm-s390.h"
+ #include "trace.h"
static int handle_set_prefix(struct kvm_vcpu *vcpu)
{
kvm_s390_set_prefix(vcpu, address);
VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
+ trace_kvm_s390_handle_prefix(vcpu, 1, address);
out:
return 0;
}
}
VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
+ trace_kvm_s390_handle_prefix(vcpu, 0, address);
out:
return 0;
}
}
VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
+ trace_kvm_s390_handle_stap(vcpu, useraddr);
out:
return 0;
}
&facility_list, sizeof(facility_list));
if (rc == -EFAULT)
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- else
+ else {
VCPU_EVENT(vcpu, 5, "store facility list value %x",
facility_list);
+ trace_kvm_s390_handle_stfl(vcpu, facility_list);
+ }
return 0;
}
spin_unlock(&fi->lock);
/* deal with other level 3 hypervisors */
- if (stsi(mem, 3, 2, 2) == -ENOSYS)
+ if (stsi(mem, 3, 2, 2))
mem->count = 0;
if (mem->count < 8)
mem->count++;
mem = get_zeroed_page(GFP_KERNEL);
if (!mem)
goto out_fail;
- if (stsi((void *) mem, fc, sel1, sel2) == -ENOSYS)
+ if (stsi((void *) mem, fc, sel1, sel2))
goto out_mem;
break;
case 3:
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
goto out_mem;
}
+ trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
free_page(mem);
vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
vcpu->run->s.regs.gprs[0] = 0;
Say no to build a 32-bit kernel - formerly known as i386
config X86_32
- def_bool !64BIT
+ def_bool y
+ depends on !64BIT
select CLKSRC_I8253
config X86_64
- def_bool 64BIT
+ def_bool y
+ depends on 64BIT
select X86_DEV_DMA_OPS
### Arch settings
select HAVE_KRETPROBES
select HAVE_OPTPROBES
select HAVE_FTRACE_MCOUNT_RECORD
+ select HAVE_FENTRY if X86_64
select HAVE_C_RECORDMCOUNT
select HAVE_DYNAMIC_FTRACE
select HAVE_FUNCTION_TRACER
select HAVE_MIXED_BREAKPOINTS_REGS
select PERF_EVENTS
select HAVE_PERF_EVENTS_NMI
+ select HAVE_PERF_REGS
+ select HAVE_PERF_USER_STACK_DUMP
select ANON_INODES
select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
select HAVE_CMPXCHG_LOCAL if !M386
select KTIME_SCALAR if X86_32
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
+ select HAVE_RCU_USER_QS if X86_64
+ select HAVE_IRQ_TIME_ACCOUNTING
config INSTRUCTION_DECODER
- def_bool (KPROBES || PERF_EVENTS || UPROBES)
+ def_bool y
+ depends on KPROBES || PERF_EVENTS || UPROBES
config OUTPUT_FORMAT
string
bool
config NEED_DMA_MAP_STATE
- def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG)
+ def_bool y
+ depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG
config NEED_SG_DMA_LENGTH
def_bool y
config GENERIC_ISA_DMA
- def_bool ISA_DMA_API
+ def_bool y
+ depends on ISA_DMA_API
config GENERIC_BUG
def_bool y
bool
config ARCH_MAY_HAVE_PC_FDC
- def_bool ISA_DMA_API
+ def_bool y
+ depends on ISA_DMA_API
config RWSEM_GENERIC_SPINLOCK
- def_bool !X86_XADD
+ def_bool y
+ depends on !X86_XADD
config RWSEM_XCHGADD_ALGORITHM
- def_bool X86_XADD
+ def_bool y
+ depends on X86_XADD
config GENERIC_CALIBRATE_DELAY
def_bool y
source "arch/x86/xen/Kconfig"
- config KVM_CLOCK
- bool "KVM paravirtualized clock"
- select PARAVIRT
- select PARAVIRT_CLOCK
- ---help---
- Turning on this option will allow you to run a paravirtualized clock
- when running over the KVM hypervisor. Instead of relying on a PIT
- (or probably other) emulation by the underlying device model, the host
- provides the guest with timing infrastructure such as time of day, and
- system time
-
config KVM_GUEST
- bool "KVM Guest support"
+ bool "KVM Guest support (including kvmclock)"
+ select PARAVIRT
select PARAVIRT
+ select PARAVIRT_CLOCK
+ default y if PARAVIRT_GUEST
---help---
This option enables various optimizations for running under the KVM
- hypervisor.
+ hypervisor. It includes a paravirtualized clock, so that instead
+ of relying on a PIT (or probably other) emulation by the
+ underlying device model, the host provides the guest with
+ timing infrastructure such as time of day, and system time
source "arch/x86/lguest/Kconfig"
def_bool y if X86_64
---help---
Support for software bounce buffers used on x86-64 systems
- which don't have a hardware IOMMU (e.g. the current generation
- of Intel's x86-64 CPUs). Using this PCI devices which can only
- access 32-bits of memory can be used on systems with more than
- 3 GB of memory. If unsure, say Y.
+ which don't have a hardware IOMMU. Using this, PCI devices
+ which can only access 32-bits of memory can be used on systems
+ with more than 3 GB of memory.
+ If unsure, say Y.
config IOMMU_HELPER
- def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
+ def_bool y
+ depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU
config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
-config IRQ_TIME_ACCOUNTING
- bool "Fine granularity task level IRQ time accounting"
- default n
- ---help---
- Select this option to enable fine granularity task irq time
- accounting. This is done by reading a timestamp on each
- transitions between softirq and hardirq state, so there can be a
- small performance impact.
-
- If in doubt, say N here.
-
source "kernel/Kconfig.preempt"
config X86_UP_APIC
config X86_MCE
bool "Machine Check / overheating reporting"
+ default y
---help---
Machine Check support allows the processor to notify the
kernel if it detects a problem (e.g. overheating, data corruption).
Say N otherwise.
config MICROCODE
- tristate "/dev/cpu/microcode - microcode support"
+ tristate "CPU microcode loading support"
select FW_LOADER
---help---
+
If you say Y here, you will be able to update the microcode on
certain Intel and AMD processors. The Intel support is for the
- IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
- Pentium 4, Xeon etc. The AMD support is for family 0x10 and
- 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
- You will obviously need the actual microcode binary data itself
- which is not shipped with the Linux kernel.
+ IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4,
+ Xeon etc. The AMD support is for families 0x10 and later. You will
+ obviously need the actual microcode binary data itself which is not
+ shipped with the Linux kernel.
This option selects the general module only, you need to select
at least one vendor specific module as well.
- To compile this driver as a module, choose M here: the
- module will be called microcode.
+ To compile this driver as a module, choose M here: the module
+ will be called microcode.
config MICROCODE_INTEL
- bool "Intel microcode patch loading support"
+ bool "Intel microcode loading support"
depends on MICROCODE
default MICROCODE
select FW_LOADER
<http://www.urbanmyth.org/microcode/>.
config MICROCODE_AMD
- bool "AMD microcode patch loading support"
+ bool "AMD microcode loading support"
depends on MICROCODE
select FW_LOADER
---help---
consumes more pagetable space per process.
config ARCH_PHYS_ADDR_T_64BIT
- def_bool X86_64 || X86_PAE
+ def_bool y
+ depends on X86_64 || X86_PAE
config ARCH_DMA_ADDR_T_64BIT
- def_bool X86_64 || HIGHMEM64G
+ def_bool y
+ depends on X86_64 || HIGHMEM64G
config DIRECT_GBPAGES
bool "Enable 1GB pages for kernel pagetables" if EXPERT
depends on ARCH_SPARSEMEM_ENABLE
config ARCH_MEMORY_PROBE
- def_bool X86_64
- depends on MEMORY_HOTPLUG
+ def_bool y
+ depends on X86_64 && MEMORY_HOTPLUG
config ARCH_PROC_KCORE_TEXT
def_bool y
If supported, this is a high bandwidth, cryptographically
secure hardware random number generator.
+config X86_SMAP
+ def_bool y
+ prompt "Supervisor Mode Access Prevention" if EXPERT
+ ---help---
+ Supervisor Mode Access Prevention (SMAP) is a security
+ feature in newer Intel processors. There is a small
+ performance cost if this is enabled and turned on; there is
+ also a small increase in the kernel size if this is enabled.
+
+ If unsure, say Y.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
config PCI_CNB20LE_QUIRK
bool "Read CNB20LE Host Bridge Windows" if EXPERT
- default n
depends on PCI && EXPERIMENTAL
help
Read the PCI windows out of the CNB20LE host bridge. This allows
depends on IA32_EMULATION || X86_X32
select ARCH_WANT_OLD_COMPAT_IPC
+if COMPAT
config COMPAT_FOR_U64_ALIGNMENT
- def_bool COMPAT
- depends on X86_64
+ def_bool y
config SYSVIPC_COMPAT
def_bool y
- depends on COMPAT && SYSVIPC
+ depends on SYSVIPC
config KEYS_COMPAT
- bool
- depends on COMPAT && KEYS
- default y
+ def_bool y
+ depends on KEYS
+endif
endmenu
#include <linux/types.h>
#include <linux/ioctl.h>
+#define DE_VECTOR 0
+#define DB_VECTOR 1
+#define BP_VECTOR 3
+#define OF_VECTOR 4
+#define BR_VECTOR 5
+#define UD_VECTOR 6
+#define NM_VECTOR 7
+#define DF_VECTOR 8
+#define TS_VECTOR 10
+#define NP_VECTOR 11
+#define SS_VECTOR 12
+#define GP_VECTOR 13
+#define PF_VECTOR 14
+#define MF_VECTOR 16
+#define MC_VECTOR 18
+
/* Select x86 specific features in <linux/kvm.h> */
#define __KVM_HAVE_PIT
#define __KVM_HAVE_IOAPIC
#define __KVM_HAVE_DEBUGREGS
#define __KVM_HAVE_XSAVE
#define __KVM_HAVE_XCRS
+ #define __KVM_HAVE_READONLY_MEM
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
-#define DE_VECTOR 0
-#define DB_VECTOR 1
-#define BP_VECTOR 3
-#define OF_VECTOR 4
-#define BR_VECTOR 5
-#define UD_VECTOR 6
-#define NM_VECTOR 7
-#define DF_VECTOR 8
-#define TS_VECTOR 10
-#define NP_VECTOR 11
-#define SS_VECTOR 12
-#define GP_VECTOR 13
-#define PF_VECTOR 14
-#define MF_VECTOR 16
-#define MC_VECTOR 18
-
#define SELECTOR_TI_MASK (1 << 2)
#define SELECTOR_RPL_MASK 0x03
union kvm_mmu_page_role base_role;
bool direct_map;
+ /*
+ * Bitmap; bit set = permission fault
+ * Byte index: page fault error code [4:1]
+ * Bit index: pte permissions in ACC_* format
+ */
+ u8 permissions[16];
+
u64 *pae_root;
u64 *lm_root;
u64 rsvd_bits_mask[2][4];
+ /*
+ * Bitmap: bit set = last pte in walk
+ * index[0:1]: level (zero-based)
+ * index[2]: pte.ps
+ */
+ u8 last_pte_bitmap;
+
bool nx;
u64 pdptrs[4]; /* pae */
struct x86_emulate_ctxt emulate_ctxt;
bool emulate_regs_need_sync_to_vcpu;
bool emulate_regs_need_sync_from_vcpu;
+ int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz;
unsigned int time_offset;
struct page *time_page;
+ /* set guest stopped flag in pvclock flags field */
+ bool pvclock_set_guest_stopped_request;
struct {
u64 msr_val;
unsigned long dr6;
unsigned long dr7;
unsigned long eff_db[KVM_NR_DB_REGS];
+ unsigned long guest_debug_dr7;
u64 mcg_cap;
u64 mcg_status;
};
struct kvm_lpage_info {
- unsigned long rmap_pde;
int write_count;
};
struct kvm_arch_memory_slot {
+ unsigned long *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};
+ struct kvm_apic_map {
+ struct rcu_head rcu;	/* NOTE(review): presumably for RCU-deferred freeing of the map - confirm */
+ u8 ldr_bits;
+ /* fields below are used to decode ldr values in different modes */
+ u32 cid_shift, cid_mask, lid_mask;
+ struct kvm_lapic *phys_map[256];	/* NOTE(review): presumably indexed by physical APIC id - confirm */
+ /* first index is cluster id, second is cpu id in a cluster */
+ struct kvm_lapic *logical_map[16][16];
+ };
+
struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
int vapics_in_nmi_mode;
+ struct mutex apic_map_lock;
+ struct kvm_apic_map *apic_map;
unsigned int tss_addr;
struct page *apic_access_page;
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
- void (*set_guest_debug)(struct kvm_vcpu *vcpu,
- struct kvm_guest_debug *dbg);
+ void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
- obj-$(CONFIG_KVM_GUEST) += kvm.o
- obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
+ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_OF) += devicetree.o
obj-$(CONFIG_UPROBES) += uprobes.o
+obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
initmem_init();
memblock_find_dma_reserve();
- #ifdef CONFIG_KVM_CLOCK
+ #ifdef CONFIG_KVM_GUEST
kvmclock_init();
#endif
- x86_init.paging.pagetable_setup_start(swapper_pg_dir);
- paging_init();
- x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+ x86_init.paging.pagetable_init();
if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
+ extern const ulong vmx_return;
+
#define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1
struct {
int vm86_active;
ulong save_rflags;
+ struct kvm_segment segs[8];
+ } rmode;
+ struct {
+ u32 bitmask; /* 4 bits per segment (1 bit per field) */
struct kvm_save_segment {
u16 selector;
unsigned long base;
u32 limit;
u32 ar;
- } tr, es, ds, fs, gs;
- } rmode;
- struct {
- u32 bitmask; /* 4 bits per segment (1 bit per field) */
- struct kvm_save_segment seg[8];
+ } seg[8];
} segment_cache;
int vpid;
bool emulation_required;
#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
[number##_HIGH] = VMCS12_OFFSET(name)+4
- static unsigned short vmcs_field_to_offset_table[] = {
+ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
{
struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
+ if (is_error_page(page))
return NULL;
- }
+
return page;
}
.ar_bytes = GUEST_##seg##_AR_BYTES, \
}
- static struct kvm_vmx_segment_field {
+ static const struct kvm_vmx_segment_field {
unsigned selector;
unsigned base;
unsigned limit;
guest_efer = vmx->vcpu.arch.efer;
/*
- * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+ * NX is emulated; LMA and LME handled by hardware; SCE meaningless
* outside long mode
*/
ignore_bits = EFER_NX | EFER_SCE;
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
- if (user_has_fpu())
- clts();
+ /*
+ * If the FPU is not active (through the host task or
+ * the guest vcpu), then restore the cr0.TS bit.
+ */
+ if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
+ stts();
load_gdt(&__get_cpu_var(host_gdt));
}
#endif
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
- CPU_BASED_RDPMC_EXITING |
+ CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/*
* We can allow some features even when not supported by the
}
}
- static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
- {
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
- else
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
-
- update_exception_bitmap(vcpu);
- }
-
static __init int cpu_has_kvm_support(void)
{
return cpu_has_vmx();
free_kvm_area();
}
- static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
+ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save)
{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ struct kvm_segment tmp = *save;
- if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
- vmcs_write16(sf->selector, save->selector);
- vmcs_writel(sf->base, save->base);
- vmcs_write32(sf->limit, save->limit);
- vmcs_write32(sf->ar_bytes, save->ar);
- } else {
- u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
- << AR_DPL_SHIFT;
- vmcs_write32(sf->ar_bytes, 0x93 | dpl);
+ if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
+ tmp.base = vmcs_readl(sf->base);
+ tmp.selector = vmcs_read16(sf->selector);
+ tmp.s = 1;
}
+ vmx_set_segment(vcpu, &tmp, seg);
}
static void enter_pmode(struct kvm_vcpu *vcpu)
vmx_segment_cache_clear(vmx);
- vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
- vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
- vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
- vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
+ vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
if (emulate_invalid_guest_state)
return;
- fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
- fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
- fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
- fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
+ fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+ fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
vmx_segment_cache_clear(vmx);
return kvm->arch.tss_addr;
}
- static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
+ static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
- save->selector = vmcs_read16(sf->selector);
- save->base = vmcs_readl(sf->base);
- save->limit = vmcs_read32(sf->limit);
- save->ar = vmcs_read32(sf->ar_bytes);
vmcs_write16(sf->selector, save->base >> 4);
vmcs_write32(sf->base, save->base & 0xffff0);
vmcs_write32(sf->limit, 0xffff);
if (enable_unrestricted_guest)
return;
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+
vmx->emulation_required = 1;
vmx->rmode.vm86_active = 1;
+
/*
* Very old userspace does not call KVM_SET_TSS_ADDR before entering
* vcpu. Call it here with phys address pointing 16M below 4G.
vmx_segment_cache_clear(vmx);
- vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
- vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
-
- vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-
- vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_save_segment *save;
u32 ar;
if (vmx->rmode.vm86_active
&& (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
|| seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
- || seg == VCPU_SREG_GS)
- && !emulate_invalid_guest_state) {
- switch (seg) {
- case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
- case VCPU_SREG_ES: save = &vmx->rmode.es; break;
- case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
- case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
- case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
- default: BUG();
- }
- var->selector = save->selector;
- var->base = save->base;
- var->limit = save->limit;
- ar = save->ar;
+ || seg == VCPU_SREG_GS)) {
+ *var = vmx->rmode.segs[seg];
if (seg == VCPU_SREG_TR
|| var->selector == vmx_read_guest_seg_selector(vmx, seg))
- goto use_saved_rmode_seg;
+ return;
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ return;
}
var->base = vmx_read_guest_seg_base(vmx, seg);
var->limit = vmx_read_guest_seg_limit(vmx, seg);
var->selector = vmx_read_guest_seg_selector(vmx, seg);
ar = vmx_read_guest_seg_ar(vmx, seg);
- use_saved_rmode_seg:
if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
ar = 0;
var->type = ar & 15;
struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
vmx_segment_cache_clear(vmx);
if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
vmcs_write16(sf->selector, var->selector);
- vmx->rmode.tr.selector = var->selector;
- vmx->rmode.tr.base = var->base;
- vmx->rmode.tr.limit = var->limit;
- vmx->rmode.tr.ar = vmx_segment_access_rights(var);
+ vmx->rmode.segs[VCPU_SREG_TR] = *var;
return;
}
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
if (vmx->rmode.vm86_active && var->s) {
+ vmx->rmode.segs[seg] = *var;
/*
* Hack real-mode segments into vm86 compatibility.
*/
* qemu binaries.
* IA32 arch specifies that at the time of processor reset the
* "Accessed" bit in the AR field of segment registers is 1. And qemu
- * is setting it to 0 in the usedland code. This causes invalid guest
+ * is setting it to 0 in the userland code. This causes invalid guest
* state vmexit when "unrestricted guest" mode is turned on.
* Fix for this setup issue in cpu_reset is being pushed in the qemu
* tree. Newer qemu binaries with that qemu fix would not need this
vmcs_readl(GUEST_CS_BASE) >> 4);
break;
case VCPU_SREG_ES:
- fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
- break;
case VCPU_SREG_DS:
- fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
- break;
case VCPU_SREG_GS:
- fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
- break;
case VCPU_SREG_FS:
- fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+ fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
break;
case VCPU_SREG_SS:
vmcs_write16(GUEST_SS_SELECTOR,
if (var.base != (var.selector << 4))
return false;
- if (var.limit != 0xffff)
+ if (var.limit < 0xffff)
return false;
- if (ar != 0xf3)
+ if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3)
return false;
return true;
static void seg_setup(int seg)
{
- struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
unsigned int ar;
vmcs_write16(sf->selector, 0);
static int alloc_apic_access_page(struct kvm *kvm)
{
+ struct page *page;
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
if (r)
goto out;
- kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+ page = gfn_to_page(kvm, 0xfee00);
+ if (is_error_page(page)) {
+ r = -EFAULT;
+ goto out;
+ }
+
+ kvm->arch.apic_access_page = page;
out:
mutex_unlock(&kvm->slots_lock);
return r;
static int alloc_identity_pagetable(struct kvm *kvm)
{
+ struct page *page;
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
if (r)
goto out;
- kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
- kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
+ page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ r = -EFAULT;
+ goto out;
+ }
+
+ kvm->arch.ept_identity_pagetable = page;
out:
mutex_unlock(&kvm->slots_lock);
return r;
unsigned long tmpl;
struct desc_ptr dt;
- vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
+ vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */
vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
native_store_idt(&dt);
vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
- asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
- vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
+ vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
kvm_rip_write(vcpu, 0);
kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
- vmcs_writel(GUEST_DR7, 0x400);
-
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
hypercall[2] = 0xc1;
}
- /* called to set cr0 as approriate for a mov-to-cr0 exit. */
+ /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
if (to_vmx(vcpu)->nested.vmxon &&
vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
- };
+ }
break;
case 2: /* clts */
handle_clts(vcpu);
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
* to be done to userspace and return 0.
*/
- static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EXCEPTION_NMI] = handle_exception,
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
[EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
msrs[i].host);
}
- #ifdef CONFIG_X86_64
- #define R "r"
- #define Q "q"
- #else
- #define R "e"
- #define Q "l"
- #endif
-
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long debugctlmsr;
if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
vmx_set_interrupt_shadow(vcpu, 0);
atomic_switch_perf_msrs(vmx);
+ debugctlmsr = get_debugctlmsr();
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
- "push %%"R"dx; push %%"R"bp;"
- "push %%"R"cx \n\t" /* placeholder for guest rcx */
- "push %%"R"cx \n\t"
- "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+ "push %%" _ASM_DX "; push %%" _ASM_BP ";"
+ "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
+ "push %%" _ASM_CX " \n\t"
+ "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
- "mov %%"R"sp, %c[host_rsp](%0) \n\t"
+ "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
/* Reload cr2 if changed */
- "mov %c[cr2](%0), %%"R"ax \n\t"
- "mov %%cr2, %%"R"dx \n\t"
- "cmp %%"R"ax, %%"R"dx \n\t"
+ "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
+ "mov %%cr2, %%" _ASM_DX " \n\t"
+ "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
"je 2f \n\t"
- "mov %%"R"ax, %%cr2 \n\t"
+ "mov %%" _ASM_AX", %%cr2 \n\t"
"2: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
- "mov %c[rax](%0), %%"R"ax \n\t"
- "mov %c[rbx](%0), %%"R"bx \n\t"
- "mov %c[rdx](%0), %%"R"dx \n\t"
- "mov %c[rsi](%0), %%"R"si \n\t"
- "mov %c[rdi](%0), %%"R"di \n\t"
- "mov %c[rbp](%0), %%"R"bp \n\t"
+ "mov %c[rax](%0), %%" _ASM_AX " \n\t"
+ "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
+ "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
+ "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
+ "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
+ "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
"mov %c[r8](%0), %%r8 \n\t"
"mov %c[r9](%0), %%r9 \n\t"
"mov %c[r14](%0), %%r14 \n\t"
"mov %c[r15](%0), %%r15 \n\t"
#endif
- "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+ "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
/* Enter guest mode */
- "jne .Llaunched \n\t"
+ "jne 1f \n\t"
__ex(ASM_VMX_VMLAUNCH) "\n\t"
- "jmp .Lkvm_vmx_return \n\t"
- ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
- ".Lkvm_vmx_return: "
+ "jmp 2f \n\t"
+ "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
+ "2: "
/* Save guest registers, load host registers, keep flags */
- "mov %0, %c[wordsize](%%"R"sp) \n\t"
+ "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
"pop %0 \n\t"
- "mov %%"R"ax, %c[rax](%0) \n\t"
- "mov %%"R"bx, %c[rbx](%0) \n\t"
- "pop"Q" %c[rcx](%0) \n\t"
- "mov %%"R"dx, %c[rdx](%0) \n\t"
- "mov %%"R"si, %c[rsi](%0) \n\t"
- "mov %%"R"di, %c[rdi](%0) \n\t"
- "mov %%"R"bp, %c[rbp](%0) \n\t"
+ "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
+ "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
+ __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
+ "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
+ "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
+ "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
+ "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
"mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
#endif
- "mov %%cr2, %%"R"ax \n\t"
- "mov %%"R"ax, %c[cr2](%0) \n\t"
+ "mov %%cr2, %%" _ASM_AX " \n\t"
+ "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
- "pop %%"R"bp; pop %%"R"dx \n\t"
+ "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
"setbe %c[fail](%0) \n\t"
+ ".pushsection .rodata \n\t"
+ ".global vmx_return \n\t"
+ "vmx_return: " _ASM_PTR " 2b \n\t"
+ ".popsection"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
[wordsize]"i"(sizeof(ulong))
: "cc", "memory"
- , R"ax", R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
+ , "rax", "rbx", "rdi", "rsi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+ #else
+ , "eax", "ebx", "edi", "esi"
#endif
);
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+ if (debugctlmsr)
+ update_debugctlmsr(debugctlmsr);
+
#ifndef CONFIG_X86_64
/*
* The sysexit path does not restore ds/es, so we must set them to
vmx_complete_interrupts(vmx);
}
- #undef R
- #undef Q
-
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
/* Exposing INVPCID only when PCID is exposed */
best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
if (vmx_invpcid_supported() &&
- best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
+ best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
guest_cpuid_has_pcid(vcpu)) {
exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
exec_control);
if (best)
- best->ecx &= ~bit(X86_FEATURE_INVPCID);
+ best->ebx &= ~bit(X86_FEATURE_INVPCID);
}
}
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .set_guest_debug = set_guest_debug,
+ .update_db_bp_intercept = update_exception_bitmap,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.get_segment_base = vmx_get_segment_base,
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
- if (irqchip_in_kernel(vcpu->kvm))
- return vcpu->arch.apic_base;
- else
- return vcpu->arch.apic_base;
+ return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
/* TODO: reserve bits check */
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_lapic_set_base(vcpu, data);
- else
- vcpu->arch.apic_base = data;
+ kvm_lapic_set_base(vcpu, data);
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
+ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
+ {
+ unsigned long dr7;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ dr7 = vcpu->arch.guest_debug_dr7;
+ else
+ dr7 = vcpu->arch.dr7;
+ kvm_x86_ops->set_dr7(vcpu, dr7);
+ vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
+ }
+
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
switch (dr) {
if (val & 0xffffffff00000000ULL)
return -1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
- kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
- vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
- }
+ kvm_update_dr7(vcpu);
break;
}
static unsigned num_msrs_to_save;
- static u32 emulated_msrs[] = {
+ static const u32 emulated_msrs[] = {
MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
* For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match
- * exact software computaion in compute_guest_tsc()
+ * exact software computation in compute_guest_tsc()
*
* These values are tracked in kvm->arch.cur_xxx variables.
*/
unsigned long this_tsc_khz;
s64 kernel_ns, max_kernel_ns;
u64 tsc_timestamp;
+ u8 pvclock_flags;
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_kernel_ns = kernel_ns;
vcpu->last_guest_tsc = tsc_timestamp;
- vcpu->hv_clock.flags = 0;
+
+ pvclock_flags = 0;
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ vcpu->hv_clock.flags = pvclock_flags;
/*
* The interface expects us to write an even number signaling that the
{
gpa_t gpa = data & ~0x3f;
- /* Bits 2:5 are resrved, Should be zero */
+ /* Bits 2:5 are reserved and should be zero */
if (data & 0x3c)
return 1;
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
- if (is_error_page(vcpu->arch.time_page)) {
- kvm_release_page_clean(vcpu->arch.time_page);
+ if (is_error_page(vcpu->arch.time_page))
vcpu->arch.time_page = NULL;
- }
+
break;
}
case MSR_KVM_ASYNC_PF_EN:
* Ignore all writes to this no longer documented MSR.
* Writes are only relevant for old K7 processors,
* all pre-dating SVM, but a recommended workaround from
- * AMD for these chips. It is possible to speicify the
+ * AMD for these chips. It is possible to specify the
* affected processor models on the command line, hence
* the need to ignore the workaround.
*/
case MSR_KVM_STEAL_TIME:
data = vcpu->arch.st.msr_val;
break;
+ case MSR_KVM_PV_EOI_EN:
+ data = vcpu->arch.pv_eoi.msr_val;
+ break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_PCI_2_3:
case KVM_CAP_KVMCLOCK_CTRL:
+ case KVM_CAP_READONLY_MEM:
+ case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
- kvm_apic_post_state_restore(vcpu);
+ kvm_apic_post_state_restore(vcpu, s);
update_cr8_intercept(vcpu);
return 0;
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
- if (irq->irq < 0 || irq->irq >= 256)
+ if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
return -EINVAL;
if (irqchip_in_kernel(vcpu->kvm))
return -ENXIO;
*/
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
{
- struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
if (!vcpu->arch.time_page)
return -EINVAL;
- src->flags |= PVCLOCK_GUEST_STOPPED;
- mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
+ vcpu->arch.pvclock_set_guest_stopped_request = true;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
return 0;
}
if (!kvm->arch.vpit)
return -ENXIO;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
- kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
return r;
}
+ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+ {
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level);
+ return 0;
+ }
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
create_pit_unlock:
mutex_unlock(&kvm->slots_lock);
break;
- case KVM_IRQ_LINE_STATUS:
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- r = -ENXIO;
- if (irqchip_in_kernel(kvm)) {
- __s32 status;
- status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
- irq_event.status = status;
- if (copy_to_user(argp, &irq_event,
- sizeof irq_event))
- goto out;
- }
- r = 0;
- }
- break;
- }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip *chip;
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ | (write ? PFERR_WRITE_MASK : 0);
- if (vcpu_match_mmio_gva(vcpu, gva) &&
- check_write_user_access(vcpu, write, access,
- vcpu->arch.access)) {
+ if (vcpu_match_mmio_gva(vcpu, gva)
+ && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
return 1;
}
- if (write)
- access |= PFERR_WRITE_MASK;
-
*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
if (*gpa == UNMAPPED_GVA)
return X86EMUL_CONTINUE;
}
- static struct read_write_emulator_ops read_emultor = {
+ static const struct read_write_emulator_ops read_emultor = {
.read_write_prepare = read_prepare,
.read_write_emulate = read_emulate,
.read_write_mmio = vcpu_mmio_read,
.read_write_exit_mmio = read_exit_mmio,
};
- static struct read_write_emulator_ops write_emultor = {
+ static const struct read_write_emulator_ops write_emultor = {
.read_write_emulate = write_emulate,
.read_write_mmio = write_mmio,
.read_write_exit_mmio = write_exit_mmio,
unsigned int bytes,
struct x86_exception *exception,
struct kvm_vcpu *vcpu,
- struct read_write_emulator_ops *ops)
+ const struct read_write_emulator_ops *ops)
{
gpa_t gpa;
int handled, ret;
int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
void *val, unsigned int bytes,
struct x86_exception *exception,
- struct read_write_emulator_ops *ops)
+ const struct read_write_emulator_ops *ops)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
goto emul_write;
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
+ if (is_error_page(page))
goto emul_write;
- }
kaddr = kmap_atomic(page);
kaddr += offset_in_page(gpa);
kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
}
- static struct x86_emulate_ops emulate_ops = {
+ static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
+ {
+ return kvm_register_read(emul_to_vcpu(ctxt), reg);
+ }
+
+ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
+ {
+ kvm_register_write(emul_to_vcpu(ctxt), reg, val);
+ }
+
+ static const struct x86_emulate_ops emulate_ops = {
+ .read_gpr = emulator_read_gpr,
+ .write_gpr = emulator_write_gpr,
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
.fetch = kvm_fetch_guest_virt,
.get_cpuid = emulator_get_cpuid,
};
- static void cache_all_regs(struct kvm_vcpu *vcpu)
- {
- kvm_register_read(vcpu, VCPU_REGS_RAX);
- kvm_register_read(vcpu, VCPU_REGS_RSP);
- kvm_register_read(vcpu, VCPU_REGS_RIP);
- vcpu->arch.regs_dirty = ~0;
- }
-
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
kvm_queue_exception(vcpu, ctxt->exception.vector);
}
- static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
- const unsigned long *regs)
+ static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
{
memset(&ctxt->twobyte, 0,
- (void *)&ctxt->regs - (void *)&ctxt->twobyte);
- memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
+ (void *)&ctxt->_regs - (void *)&ctxt->twobyte);
ctxt->fetch.start = 0;
ctxt->fetch.end = 0;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
- /*
- * TODO: fix emulate.c to use guest_read/write_register
- * instead of direct ->regs accesses, can save hundred cycles
- * on Intel for instructions that don't read/change RSP, for
- * for example.
- */
- cache_all_regs(vcpu);
-
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu);
X86EMUL_MODE_PROT16;
ctxt->guest_mode = is_guest_mode(vcpu);
- init_decode_cache(ctxt, vcpu->arch.regs);
+ init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
return EMULATE_FAIL;
ctxt->eip = ctxt->_eip;
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa;
+ pfn_t pfn;
if (tdp_enabled)
return false;
/*
* if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-entetr the
+ * and it failed try to unshadow page and re-enter the
* guest to let CPU execute the instruction.
*/
if (kvm_mmu_unprotect_page_virt(vcpu, gva))
if (gpa == UNMAPPED_GVA)
return true; /* let cpu generate fault */
- if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+ /*
+ * Do not retry the unhandleable instruction if it faults on
+ * read-only host memory, otherwise it will go into an infinite loop:
+ * retry instruction -> write #PF -> emulation fail -> retry
+ * instruction -> ...
+ */
+ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+ if (!is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
return true;
+ }
return false;
}
return true;
}
+ static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
+ static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
int emulation_type,
changes registers values during IO operation */
if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
- memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
+ emulator_invalidate_register_cache(ctxt);
}
restart:
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in)
vcpu->arch.pio.count = 0;
- else
+ else {
writeback = false;
+ vcpu->arch.complete_userspace_io = complete_emulated_pio;
+ }
r = EMULATE_DO_MMIO;
} else if (vcpu->mmio_needed) {
if (!vcpu->mmio_is_write)
writeback = false;
r = EMULATE_DO_MMIO;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
} else if (r == EMULATION_RESTART)
goto restart;
else
toggle_interruptibility(vcpu, ctxt->interruptibility);
kvm_set_rflags(vcpu, ctxt->eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
} else
if (cpu_has_xsave)
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ kvm_lapic_init();
return 0;
out:
!kvm_event_needs_reinjection(vcpu);
}
-static void vapic_enter(struct kvm_vcpu *vcpu)
+static int vapic_enter(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct page *page;
if (!apic || !apic->vapic_addr)
- return;
+ return 0;
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return -EFAULT;
vcpu->arch.apic->vapic_page = page;
+ return 0;
}
static void vapic_exit(struct kvm_vcpu *vcpu)
}
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- vapic_enter(vcpu);
+ r = vapic_enter(vcpu);
+ if (r) {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ return r;
+ }
r = 1;
while (r > 0) {
return r;
}
+ static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
+ {
+ int r;
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r != EMULATE_DONE)
+ return 0;
+ return 1;
+ }
+
+ static int complete_emulated_pio(struct kvm_vcpu *vcpu)
+ {
+ BUG_ON(!vcpu->arch.pio.count);
+
+ return complete_emulated_io(vcpu);
+ }
+
/*
* Implements the following, as a state machine:
*
* copy data
* exit
*/
- static int complete_mmio(struct kvm_vcpu *vcpu)
+ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
struct kvm_mmio_fragment *frag;
- int r;
- if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
- return 1;
+ BUG_ON(!vcpu->mmio_needed);
- if (vcpu->mmio_needed) {
- /* Complete previous fragment */
- frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
- if (!vcpu->mmio_is_write)
- memcpy(frag->data, run->mmio.data, frag->len);
- if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
- vcpu->mmio_needed = 0;
- if (vcpu->mmio_is_write)
- return 1;
- vcpu->mmio_read_completed = 1;
- goto done;
- }
- /* Initiate next fragment */
- ++frag;
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = frag->gpa;
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
+ if (!vcpu->mmio_is_write)
+ memcpy(frag->data, run->mmio.data, frag->len);
+ if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
if (vcpu->mmio_is_write)
- memcpy(run->mmio.data, frag->data, frag->len);
- run->mmio.len = frag->len;
- run->mmio.is_write = vcpu->mmio_is_write;
- return 0;
-
- }
- done:
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r != EMULATE_DONE)
- return 0;
- return 1;
+ return 1;
+ vcpu->mmio_read_completed = 1;
+ return complete_emulated_io(vcpu);
+ }
+ /* Initiate next fragment */
+ ++frag;
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = frag->gpa;
+ if (vcpu->mmio_is_write)
+ memcpy(run->mmio.data, frag->data, frag->len);
+ run->mmio.len = frag->len;
+ run->mmio.is_write = vcpu->mmio_is_write;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ return 0;
}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
}
}
- r = complete_mmio(vcpu);
- if (r <= 0)
- goto out;
+ if (unlikely(vcpu->arch.complete_userspace_io)) {
+ int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+ vcpu->arch.complete_userspace_io = NULL;
+ r = cui(vcpu);
+ if (r <= 0)
+ goto out;
+ } else
+ WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
r = __vcpu_run(vcpu);
/*
* We are here if userspace calls get_regs() in the middle of
* instruction emulation. Registers state needs to be copied
- * back from emulation context to vcpu. Usrapace shouldn't do
+ * back from emulation context to vcpu. Userspace shouldn't do
* that usually, but some bad designed PV devices (vmware
* backdoor interface) need this to work
*/
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+ emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
}
regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
if (ret)
return EMULATE_FAIL;
- memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
- max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+ max_bits = KVM_NR_INTERRUPTS;
pending_vec = find_first_bit(
(const unsigned long *)sregs->interrupt_bitmap, max_bits);
if (pending_vec < max_bits) {
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
for (i = 0; i < KVM_NR_DB_REGS; ++i)
vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
- vcpu->arch.switch_db_regs =
- (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
+ vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
} else {
for (i = 0; i < KVM_NR_DB_REGS; i++)
vcpu->arch.eff_db[i] = vcpu->arch.db[i];
- vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
}
+ kvm_update_dr7(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops->set_guest_debug(vcpu, dbg);
+ kvm_x86_ops->update_db_bp_intercept(vcpu);
r = 0;
*/
kvm_put_guest_xcr0(vcpu);
vcpu->guest_fpu_loaded = 1;
- unlazy_fpu(current);
+ __kernel_fpu_begin();
fpu_restore_checking(&vcpu->arch.guest_fpu);
trace_kvm_fpu(1);
}
vcpu->guest_fpu_loaded = 0;
fpu_save_init(&vcpu->arch.guest_fpu);
+ __kernel_fpu_end();
++vcpu->stat.fpu_reload;
kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
trace_kvm_fpu(0);
int r;
vcpu->arch.mtrr_state.have_fixed = 1;
- vcpu_load(vcpu);
+ r = vcpu_load(vcpu);
+ if (r)
+ return r;
r = kvm_arch_vcpu_reset(vcpu);
if (r == 0)
r = kvm_mmu_setup(vcpu);
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ int r;
vcpu->arch.apf.msr_val = 0;
- vcpu_load(vcpu);
+ r = vcpu_load(vcpu);
+ BUG_ON(r);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
- vcpu->arch.switch_db_regs = 0;
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
vcpu->arch.dr6 = DR6_FIXED_1;
vcpu->arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.apf.msr_val = 0;
* as we reset last_host_tsc on all VCPUs to stop this from being
* called multiple times (one for each physical CPU bringup).
*
- * Platforms with unnreliable TSCs don't have to deal with this, they
+ * Platforms with unreliable TSCs don't have to deal with this, they
* will be compensated by the logic in vcpu_load, which sets the TSC to
* catchup mode. This will catchup all VCPUs to real time, but cannot
* guarantee that they stay in perfect synchronization.
return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
}
+ struct static_key kvm_no_apic_vcpu __read_mostly;
+
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
- }
+ } else
+ static_key_slow_inc(&kvm_no_apic_vcpu);
vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
GFP_KERNEL);
kvm_mmu_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
+ if (!irqchip_in_kernel(vcpu->kvm))
+ static_key_slow_dec(&kvm_no_apic_vcpu);
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+ /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
+ set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ &kvm->arch.irq_sources_bitmap);
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+ mutex_init(&kvm->arch.apic_map_lock);
return 0;
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
- vcpu_load(vcpu);
+ int r;
+ r = vcpu_load(vcpu);
+ BUG_ON(r);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
}
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
+ kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
}
void kvm_arch_free_memslot(struct kvm_memory_slot *free,
{
int i;
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
- kvm_kvfree(free->arch.lpage_info[i]);
- free->arch.lpage_info[i] = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
+ kvm_kvfree(free->arch.rmap[i]);
+ free->arch.rmap[i] = NULL;
+ }
+ if (i == 0)
+ continue;
+
+ if (!dont || free->arch.lpage_info[i - 1] !=
+ dont->arch.lpage_info[i - 1]) {
+ kvm_kvfree(free->arch.lpage_info[i - 1]);
+ free->arch.lpage_info[i - 1] = NULL;
}
}
}
{
int i;
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
unsigned long ugfn;
int lpages;
- int level = i + 2;
+ int level = i + 1;
lpages = gfn_to_index(slot->base_gfn + npages - 1,
slot->base_gfn, level) + 1;
- slot->arch.lpage_info[i] =
- kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
- if (!slot->arch.lpage_info[i])
+ slot->arch.rmap[i] =
+ kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
+ if (!slot->arch.rmap[i])
+ goto out_free;
+ if (i == 0)
+ continue;
+
+ slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
+ sizeof(*slot->arch.lpage_info[i - 1]));
+ if (!slot->arch.lpage_info[i - 1])
goto out_free;
if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i][0].write_count = 1;
+ slot->arch.lpage_info[i - 1][0].write_count = 1;
if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
- slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+ slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
ugfn = slot->userspace_addr >> PAGE_SHIFT;
/*
* If the gfn and userspace address are not aligned wrt each
unsigned long j;
for (j = 0; j < lpages; ++j)
- slot->arch.lpage_info[i][j].write_count = 1;
+ slot->arch.lpage_info[i - 1][j].write_count = 1;
}
}
return 0;
out_free:
- for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- kvm_kvfree(slot->arch.lpage_info[i]);
- slot->arch.lpage_info[i] = NULL;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ kvm_kvfree(slot->arch.rmap[i]);
+ slot->arch.rmap[i] = NULL;
+ if (i == 0)
+ continue;
+
+ kvm_kvfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
}
return -ENOMEM;
}
map_flags = MAP_SHARED | MAP_ANONYMOUS;
/*To keep backward compatibility with older userspace,
- *x86 needs to hanlde !user_alloc case.
+ *x86 needs to handle !user_alloc case.
*/
if (!user_alloc) {
- if (npages && !old.rmap) {
+ if (npages && !old.npages) {
unsigned long userspace_addr;
userspace_addr = vm_mmap(NULL, 0,
int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
- if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
+ if (!user_alloc && !old.user_alloc && old.npages && !npages) {
int ret;
ret = vm_munmap(old.userspace_addr,
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
kvm_mmu_slot_remove_write_access(kvm, mem->slot);
spin_unlock(&kvm->mmu_lock);
+ /*
+ * If a memory slot is created or moved, we need to clear all
+ * mmio sptes.
+ */
+ if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
+ kvm_mmu_zap_all(kvm);
+ kvm_reload_remote_mmus(kvm);
+ }
}
- void kvm_arch_flush_shadow(struct kvm *kvm)
+ void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
kvm_mmu_zap_all(kvm);
kvm_reload_remote_mmus(kvm);
}
+ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+ {
+ kvm_arch_flush_shadow_all(kvm);
+ }
+
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/ratelimit.h>
+ #include <linux/err.h>
#include <asm/signal.h>
#include <linux/kvm.h>
#define KVM_MMIO_SIZE 8
#endif
+ /*
+ * Bits 16 ~ 31 of kvm_memory_region::flags are used internally by
+ * kvm; the other bits are visible to userspace and are defined in
+ * include/linux/kvm.h.
+ */
+ #define KVM_MEMSLOT_INVALID (1UL << 16)
+
/*
* If we support unaligned MMIO, at most one fragment will be split into two:
*/
#define KVM_MAX_MMIO_FRAGMENTS \
(KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
+ /*
+ * For a normal pfn, the highest 12 bits should be zero,
+ * so we can set these bits to indicate an error.
+ */
+ #define KVM_PFN_ERR_MASK (0xfffULL << 52)
+
+ #define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK)
+ #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
+ #define KVM_PFN_ERR_BAD (KVM_PFN_ERR_MASK + 2)
+ #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 3)
+
+ static inline bool is_error_pfn(pfn_t pfn)
+ {
+ return !!(pfn & KVM_PFN_ERR_MASK);
+ }
+
+ static inline bool is_noslot_pfn(pfn_t pfn)
+ {
+ return pfn == KVM_PFN_ERR_BAD;
+ }
+
+ static inline bool is_invalid_pfn(pfn_t pfn)
+ {
+ return !is_noslot_pfn(pfn) && is_error_pfn(pfn);
+ }
+
+ #define KVM_HVA_ERR_BAD (PAGE_OFFSET)
+ #define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE)
+
+ static inline bool kvm_is_error_hva(unsigned long addr)
+ {
+ return addr >= PAGE_OFFSET;
+ }
+
+ #define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT))
+
+ static inline bool is_error_page(struct page *page)
+ {
+ return IS_ERR(page);
+ }
+
/*
* vcpu->requests bit members
*/
#define KVM_REQ_PMU 16
#define KVM_REQ_PMI 17
- #define KVM_USERSPACE_IRQ_SOURCE_ID 0
+ #define KVM_USERSPACE_IRQ_SOURCE_ID 0
+ #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
struct kvm;
struct kvm_vcpu;
} async_pf;
#endif
+ #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+ /*
+ * Cpu relax intercept or pause loop exit optimization
+ * in_spin_loop: set when a vcpu does a pause loop exit
+ * or cpu relax intercepted.
+ * dy_eligible: indicates whether vcpu is eligible for directed yield.
+ */
+ struct {
+ bool in_spin_loop;
+ bool dy_eligible;
+ } spin_loop;
+ #endif
struct kvm_vcpu_arch arch;
};
gfn_t base_gfn;
unsigned long npages;
unsigned long flags;
- unsigned long *rmap;
unsigned long *dirty_bitmap;
struct kvm_arch_memory_slot arch;
unsigned long userspace_addr;
struct {
spinlock_t lock;
struct list_head items;
+ struct list_head resampler_list;
+ struct mutex resampler_lock;
} irqfds;
struct list_head ioeventfds;
#endif
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
- void vcpu_load(struct kvm_vcpu *vcpu);
+ int __must_check vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
return slot;
}
- #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
- #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
- static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-
- extern struct page *bad_page;
- extern struct page *fault_page;
-
- extern pfn_t bad_pfn;
- extern pfn_t fault_pfn;
-
- int is_error_page(struct page *page);
- int is_error_pfn(pfn_t pfn);
- int is_hwpoison_pfn(pfn_t pfn);
- int is_fault_pfn(pfn_t pfn);
- int is_noslot_pfn(pfn_t pfn);
- int is_invalid_pfn(pfn_t pfn);
- int kvm_is_error_hva(unsigned long addr);
int kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc);
int user_alloc);
bool kvm_largepages_enabled(void);
void kvm_disable_largepages(void);
- void kvm_arch_flush_shadow(struct kvm *kvm);
+ /* flush all memory translations */
+ void kvm_arch_flush_shadow_all(struct kvm *kvm);
+ /* flush memory translations pointing to 'slot' */
+ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
int nr_pages);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
+ unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
- pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
bool write_fault, bool *writable);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable);
- pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn);
- void kvm_release_pfn_dirty(pfn_t);
+ pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
+ pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+
+ void kvm_release_pfn_dirty(pfn_t pfn);
void kvm_release_pfn_clean(pfn_t pfn);
void kvm_set_pfn_dirty(pfn_t pfn);
void kvm_set_pfn_accessed(pfn_t pfn);
struct
kvm_userspace_memory_region *mem,
int user_alloc);
+ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
- int kvm_is_mmio_pfn(pfn_t pfn);
+ bool kvm_is_mmio_pfn(pfn_t pfn);
struct kvm_irq_ack_notifier {
struct hlist_node link;
static inline void kvm_guest_enter(void)
{
BUG_ON(preemptible());
- account_system_vtime(current);
+ vtime_account(current);
current->flags |= PF_VCPU;
/* KVM does not hold any references to rcu protected data when it
* switches CPU into a guest mode. In fact switching to a guest mode
static inline void kvm_guest_exit(void)
{
- account_system_vtime(current);
+ vtime_account(current);
current->flags &= ~PF_VCPU;
}
return search_memslots(slots, gfn);
}
+ /*
+ * Compute the host virtual address that backs @gfn inside @slot.
+ * Pure arithmetic: neither @slot nor the range of @gfn is validated here.
+ */
+ static inline unsigned long
+ __gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+ return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+ }
+
static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
{
return gfn_to_memslot(kvm, gfn)->id;
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}
- static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
- gfn_t gfn)
+ /*
+ * Inverse of __gfn_to_hva_memslot(): map host virtual address @hva,
+ * taken relative to @slot->userspace_addr, back to a guest frame number.
+ * No check is made that @hva actually lies inside @slot.
+ */
+ static inline gfn_t
+ hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
{
- return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+ gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
+
+ return slot->base_gfn + gfn_offset;
}
static inline gpa_t gfn_to_gpa(gfn_t gfn)
}
}
+ #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+
+ /* Record in @vcpu->spin_loop whether the vcpu is inside the spin-loop handler. */
+ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+ {
+ vcpu->spin_loop.in_spin_loop = val;
+ }
+ /* Record in @vcpu->spin_loop whether the vcpu may be a directed-yield target. */
+ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+ {
+ vcpu->spin_loop.dy_eligible = val;
+ }
+
+ #else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+
+ /* Stub: there is no spin_loop state to update in this configuration. */
+ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+ {
+ }
+
+ /* Stub: there is no spin_loop state to update in this configuration. */
+ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+ {
+ }
+
+ /* Stub: without the spin-loop heuristic every vcpu is reported eligible. */
+ static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+ {
+ return true;
+ }
+
+ #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
#endif
* --------------------------------------------------------------------
*/
+ /*
+ * Resampling irqfds are a special variety of irqfds used to emulate
+ * level triggered interrupts. The interrupt is asserted on eventfd
+ * trigger. On acknowledgement through the irq ack notifier, the
+ * interrupt is de-asserted and userspace is notified through the
+ * resamplefd. All resamplers on the same gsi are de-asserted
+ * together, so we don't need to track the state of each individual
+ * user. We can also therefore share the same irq source ID.
+ *
+ * @kvm is the VM on which the ack handler de-asserts the gsi, and
+ * @notifier carries that gsi together with the irq_acked callback.
+ */
+ struct _irqfd_resampler {
+ struct kvm *kvm;
+ /*
+ * List of resampling struct _irqfd objects sharing this gsi.
+ * RCU list modified under kvm->irqfds.resampler_lock
+ */
+ struct list_head list;
+ struct kvm_irq_ack_notifier notifier;
+ /*
+ * Entry in list of kvm->irqfds.resampler_list. Used for sharing
+ * resamplers among irqfds on the same gsi.
+ * Accessed and modified under kvm->irqfds.resampler_lock
+ */
+ struct list_head link;
+ };
+
+
struct _irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
+ /* The resampler used by this irqfd (resampler-only) */
+ struct _irqfd_resampler *resampler;
+ /* Eventfd notified on resample (resampler-only) */
+ struct eventfd_ctx *resamplefd;
+ /* Entry in list of irqfds for a resampler (resampler-only) */
+ struct list_head resampler_link;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm *kvm = irqfd->kvm;
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+ if (!irqfd->resampler) {
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+ kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+ } else
+ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ irqfd->gsi, 1);
+ }
+
+ /*
+ * Since resampler irqfds share an IRQ source ID, we de-assert once
+ * then notify all of the resampler irqfds using this GSI. We can't
+ * do multiple de-asserts or we risk racing with incoming re-asserts.
+ *
+ * This is the irq_acked callback installed on the resampler's ack
+ * notifier; @kian is that embedded notifier. The resampler's irqfd
+ * list is walked inside an RCU read-side critical section and each
+ * irqfd's resamplefd is signalled.
+ */
+ static void
+ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
+ {
+ struct _irqfd_resampler *resampler;
+ struct _irqfd *irqfd;
+
+ resampler = container_of(kian, struct _irqfd_resampler, notifier);
+
+ kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ resampler->notifier.gsi, 0);
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
+ eventfd_signal(irqfd->resamplefd, 1);
+
+ rcu_read_unlock();
+ }
+
+
+ /*
+ * Detach @irqfd from its resampler. Runs under kvm->irqfds.resampler_lock:
+ * the irqfd is unlinked from the resampler's RCU list and we wait for a
+ * grace period so that no ack handler can still be walking over it. If
+ * that was the last irqfd on the resampler, the resampler itself is
+ * unlinked, its ack notifier unregistered, the gsi de-asserted and the
+ * resampler freed.
+ *
+ * Callers must only invoke this when irqfd->resampler is non-NULL.
+ */
+ static void
+ irqfd_resampler_shutdown(struct _irqfd *irqfd)
+ {
+ struct _irqfd_resampler *resampler = irqfd->resampler;
+ struct kvm *kvm = resampler->kvm;
+
+ mutex_lock(&kvm->irqfds.resampler_lock);
+
+ list_del_rcu(&irqfd->resampler_link);
+ synchronize_rcu();
+
+ if (list_empty(&resampler->list)) {
+ list_del(&resampler->link);
+ kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
+ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ resampler->notifier.gsi, 0);
+ kfree(resampler);
+ }
+
+ mutex_unlock(&kvm->irqfds.resampler_lock);
}
/*
* We know no new events will be scheduled at this point, so block
* until all previously outstanding events have completed
*/
- flush_work_sync(&irqfd->inject);
+ flush_work(&irqfd->inject);
+ if (irqfd->resampler) {
+ irqfd_resampler_shutdown(irqfd);
+ eventfd_ctx_put(irqfd->resamplefd);
+ }
+
/*
* It is now safe to release the object's resources
*/
struct kvm_irq_routing_table *irq_rt;
struct _irqfd *irqfd, *tmp;
struct file *file = NULL;
- struct eventfd_ctx *eventfd = NULL;
+ struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
unsigned int events;
irqfd->eventfd = eventfd;
+ /*
+ * Resampling irqfd: take a reference on the resample eventfd, then
+ * attach this irqfd to the resampler for its gsi, creating and
+ * registering a new resampler if none exists yet.
+ */
+ if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
+ struct _irqfd_resampler *resampler;
+
+ resamplefd = eventfd_ctx_fdget(args->resamplefd);
+ if (IS_ERR(resamplefd)) {
+ ret = PTR_ERR(resamplefd);
+ goto fail;
+ }
+
+ irqfd->resamplefd = resamplefd;
+ INIT_LIST_HEAD(&irqfd->resampler_link);
+
+ mutex_lock(&kvm->irqfds.resampler_lock);
+
+ /*
+ * Resamplers are chained on kvm->irqfds.resampler_list through
+ * their 'link' member. 'list' is the head of each resampler's
+ * own irqfd list and must not be used as the iterator member.
+ */
+ list_for_each_entry(resampler,
+ &kvm->irqfds.resampler_list, link) {
+ if (resampler->notifier.gsi == irqfd->gsi) {
+ irqfd->resampler = resampler;
+ break;
+ }
+ }
+
+ if (!irqfd->resampler) {
+ resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+ if (!resampler) {
+ ret = -ENOMEM;
+ mutex_unlock(&kvm->irqfds.resampler_lock);
+ goto fail;
+ }
+
+ resampler->kvm = kvm;
+ INIT_LIST_HEAD(&resampler->list);
+ resampler->notifier.gsi = irqfd->gsi;
+ resampler->notifier.irq_acked = irqfd_resampler_ack;
+ INIT_LIST_HEAD(&resampler->link);
+
+ list_add(&resampler->link, &kvm->irqfds.resampler_list);
+ kvm_register_irq_ack_notifier(kvm,
+ &resampler->notifier);
+ irqfd->resampler = resampler;
+ }
+
+ list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
+ synchronize_rcu();
+
+ mutex_unlock(&kvm->irqfds.resampler_lock);
+ }
+
+
/*
* Install our own custom wake-up handling so we are notified via
* a callback whenever someone signals the underlying eventfd
return 0;
fail:
+ if (irqfd->resampler)
+ irqfd_resampler_shutdown(irqfd);
+
+ if (resamplefd && !IS_ERR(resamplefd))
+ eventfd_ctx_put(resamplefd);
+
if (eventfd && !IS_ERR(eventfd))
eventfd_ctx_put(eventfd);
{
spin_lock_init(&kvm->irqfds.lock);
INIT_LIST_HEAD(&kvm->irqfds.items);
+ INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
+ mutex_init(&kvm->irqfds.resampler_lock);
INIT_LIST_HEAD(&kvm->ioeventfds);
}
int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
- if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
+ if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
return -EINVAL;
if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
static bool largepages_enabled = true;
- static struct page *hwpoison_page;
- static pfn_t hwpoison_pfn;
-
- struct page *fault_page;
- pfn_t fault_pfn;
-
- inline int kvm_is_mmio_pfn(pfn_t pfn)
+ bool kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn)) {
int reserved;
/*
* Switches to specified vcpu, until a matching vcpu_put()
*/
- void vcpu_load(struct kvm_vcpu *vcpu)
+ int vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu;
- mutex_lock(&vcpu->mutex);
+ if (mutex_lock_killable(&vcpu->mutex))
+ return -EINTR;
if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
/* The thread running this VCPU changed. */
struct pid *oldpid = vcpu->pid;
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
+ return 0;
}
void vcpu_put(struct kvm_vcpu *vcpu)
}
vcpu->run = page_address(page);
+ kvm_vcpu_set_in_spin_loop(vcpu, false);
+ kvm_vcpu_set_dy_eligible(vcpu, false);
+
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
goto fail_free_run;
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
- for (; start < end; start += PAGE_SIZE)
- need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
int idx;
idx = srcu_read_lock(&kvm->srcu);
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_all(kvm);
srcu_read_unlock(&kvm->srcu, idx);
}
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
- if (!dont || free->rmap != dont->rmap)
- vfree(free->rmap);
-
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
kvm_destroy_dirty_bitmap(free);
kvm_arch_free_memslot(free, dont);
free->npages = 0;
- free->rmap = NULL;
}
void kvm_free_physmem(struct kvm *kvm)
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_all(kvm);
#endif
kvm_arch_destroy_vm(kvm);
kvm_free_physmem(kvm);
slots->generation++;
}
+ /*
+ * Validate the flags of a userspace memory region request.
+ * KVM_MEM_LOG_DIRTY_PAGES is always accepted; KVM_MEM_READONLY only
+ * when KVM_CAP_READONLY_MEM is defined.
+ * Returns 0 if only accepted flags are set, -EINVAL otherwise.
+ */
+ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+ {
+ u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+
+ #ifdef KVM_CAP_READONLY_MEM
+ valid_flags |= KVM_MEM_READONLY;
+ #endif
+
+ if (mem->flags & ~valid_flags)
+ return -EINVAL;
+
+ return 0;
+ }
+
/*
* Allocate some memory and give it an address in the guest physical address
* space.
struct kvm_memory_slot old, new;
struct kvm_memslots *slots, *old_memslots;
+ r = check_memory_region_flags(mem);
+ if (r)
+ goto out;
+
r = -EINVAL;
/* General sanity checks */
if (mem->memory_size & (PAGE_SIZE - 1))
if (npages && !old.npages) {
new.user_alloc = user_alloc;
new.userspace_addr = mem->userspace_addr;
- #ifndef CONFIG_S390
- new.rmap = vzalloc(npages * sizeof(*new.rmap));
- if (!new.rmap)
- goto out_free;
- #endif /* not defined CONFIG_S390 */
+
if (kvm_arch_create_memslot(&new, npages))
goto out_free;
}
/* destroy any largepage mappings for dirty tracking */
}
- if (!npages) {
+ if (!npages || base_gfn != old.base_gfn) {
struct kvm_memory_slot *slot;
r = -ENOMEM;
old_memslots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);
- /* From this point no new shadow pages pointing to a deleted
- * memslot will be created.
+ /* From this point no new shadow pages pointing to a deleted,
+ * or moved, memslot will be created.
*
* validation of sp->gfn happens in:
* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
* - kvm_is_visible_gfn (mmu_check_roots)
*/
- kvm_arch_flush_shadow(kvm);
+ kvm_arch_flush_shadow_memslot(kvm, slot);
kfree(old_memslots);
}
/* actual memory is freed via old in kvm_free_physmem_slot below */
if (!npages) {
- new.rmap = NULL;
new.dirty_bitmap = NULL;
memset(&new.arch, 0, sizeof(new.arch));
}
kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
- /*
- * If the new memory slot is created, we need to clear all
- * mmio sptes.
- */
- if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
- kvm_arch_flush_shadow(kvm);
-
kvm_free_physmem_slot(&old, &new);
kfree(old_memslots);
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);
- int is_error_page(struct page *page)
- {
- return page == bad_page || page == hwpoison_page || page == fault_page;
- }
- EXPORT_SYMBOL_GPL(is_error_page);
-
- int is_error_pfn(pfn_t pfn)
- {
- return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
- }
- EXPORT_SYMBOL_GPL(is_error_pfn);
-
- int is_hwpoison_pfn(pfn_t pfn)
- {
- return pfn == hwpoison_pfn;
- }
- EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-
- int is_fault_pfn(pfn_t pfn)
- {
- return pfn == fault_pfn;
- }
- EXPORT_SYMBOL_GPL(is_fault_pfn);
-
- int is_noslot_pfn(pfn_t pfn)
- {
- return pfn == bad_pfn;
- }
- EXPORT_SYMBOL_GPL(is_noslot_pfn);
-
- int is_invalid_pfn(pfn_t pfn)
- {
- return pfn == hwpoison_pfn || pfn == fault_pfn;
- }
- EXPORT_SYMBOL_GPL(is_invalid_pfn);
-
- static inline unsigned long bad_hva(void)
- {
- return PAGE_OFFSET;
- }
-
- int kvm_is_error_hva(unsigned long addr)
- {
- return addr == bad_hva();
- }
- EXPORT_SYMBOL_GPL(kvm_is_error_hva);
-
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
return __gfn_to_memslot(kvm_memslots(kvm), gfn);
return size;
}
- static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
- gfn_t *nr_pages)
+ /* Returns true if @slot carries the KVM_MEM_READONLY flag. */
+ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+ {
+ return slot->flags & KVM_MEM_READONLY;
+ }
+
+ /*
+ * Translate @gfn in @slot to a host virtual address.
+ * Returns KVM_HVA_ERR_BAD if @slot is NULL or marked invalid, and
+ * KVM_HVA_ERR_RO_BAD if @write is requested on a read-only slot.
+ * If @nr_pages is non-NULL it receives the number of pages from @gfn
+ * to the end of the slot.
+ */
+ static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages, bool write)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
- return bad_hva();
+ return KVM_HVA_ERR_BAD;
+
+ if (memslot_is_readonly(slot) && write)
+ return KVM_HVA_ERR_RO_BAD;
if (nr_pages)
*nr_pages = slot->npages - (gfn - slot->base_gfn);
- return gfn_to_hva_memslot(slot, gfn);
+ return __gfn_to_hva_memslot(slot, gfn);
}
+ static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ gfn_t *nr_pages)
+ {
+ return __gfn_to_hva_many(slot, gfn, nr_pages, true);
+ }
+
+ unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+ gfn_t gfn)
+ {
+ return gfn_to_hva_many(slot, gfn, NULL);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
+
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
- static pfn_t get_fault_pfn(void)
+ /*
+ * The hva returned by this function is only allowed to be read.
+ * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
+ *
+ * The lookup is done with write == false, so a read-only memslot
+ * yields a usable hva here instead of KVM_HVA_ERR_RO_BAD.
+ */
+ static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
+ {
+ return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
+ }
+
+ /* Copy @len bytes from user address @hva into @data via __copy_from_user(). */
+ static int kvm_read_hva(void *data, void __user *hva, int len)
{
- get_page(fault_page);
- return fault_pfn;
+ return __copy_from_user(data, hva, len);
+ }
+
+ /* As kvm_read_hva(), but using __copy_from_user_inatomic(). */
+ static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
+ {
+ return __copy_from_user_inatomic(data, hva, len);
}
int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
return rc == -EHWPOISON;
}
- static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
- bool *async, bool write_fault, bool *writable)
+ /*
+ * The atomic path to get the writable pfn which will be stored in @pfn,
+ * true indicates success, otherwise false is returned.
+ *
+ * The fast path is only attempted when the caller is atomic or
+ * asynchronous, and only when a writable mapping is acceptable (a
+ * write fault, or @writable is non-NULL). On success *@writable, if
+ * provided, is set to true. On failure @pfn is left untouched.
+ */
+ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable, pfn_t *pfn)
{
struct page *page[1];
- int npages = 0;
- pfn_t pfn;
+ int npages;
- /* we can do it either atomically or asynchronously, not both */
- BUG_ON(atomic && async);
+ if (!(async || atomic))
+ return false;
- BUG_ON(!write_fault && !writable);
+ /*
+ * Fast pin a writable pfn only if it is a write fault request
+ * or the caller allows to map a writable pfn for a read fault
+ * request.
+ */
+ if (!(write_fault || writable))
+ return false;
- if (writable)
- *writable = true;
+ npages = __get_user_pages_fast(addr, 1, 1, page);
+ if (npages == 1) {
+ *pfn = page_to_pfn(page[0]);
- if (atomic || async)
- npages = __get_user_pages_fast(addr, 1, 1, page);
+ if (writable)
+ *writable = true;
+ return true;
+ }
- if (unlikely(npages != 1) && !atomic) {
- might_sleep();
+ return false;
+ }
- if (writable)
- *writable = write_fault;
+ /*
+ * The slow path to get the pfn of the specified host virtual address.
+ * May sleep. Returns 1 on success, with the pfn stored in @pfn; any
+ * other value is the failing result of the page lookup (-errno if an
+ * error is detected) and @pfn is left untouched.
+ *
+ * When @writable is non-NULL it is first set to @write_fault, and is
+ * upgraded to true if a read fault could also be pinned writable.
+ * When @async is non-NULL the page is looked up without waiting, under
+ * the mmap semaphore of the current mm.
+ */
+ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+ bool *writable, pfn_t *pfn)
+ {
+ struct page *page[1];
+ int npages = 0;
- if (async) {
- down_read(&current->mm->mmap_sem);
- npages = get_user_page_nowait(current, current->mm,
- addr, write_fault, page);
- up_read(&current->mm->mmap_sem);
- } else
- npages = get_user_pages_fast(addr, 1, write_fault,
- page);
-
- /* map read fault as writable if possible */
- if (unlikely(!write_fault) && npages == 1) {
- struct page *wpage[1];
-
- npages = __get_user_pages_fast(addr, 1, 1, wpage);
- if (npages == 1) {
- *writable = true;
- put_page(page[0]);
- page[0] = wpage[0];
- }
- npages = 1;
+ might_sleep();
+
+ if (writable)
+ *writable = write_fault;
+
+ if (async) {
+ down_read(&current->mm->mmap_sem);
+ npages = get_user_page_nowait(current, current->mm,
+ addr, write_fault, page);
+ up_read(&current->mm->mmap_sem);
+ } else
+ npages = get_user_pages_fast(addr, 1, write_fault,
+ page);
+ if (npages != 1)
+ return npages;
+
+ /* map read fault as writable if possible */
+ if (unlikely(!write_fault) && writable) {
+ struct page *wpage[1];
+
+ npages = __get_user_pages_fast(addr, 1, 1, wpage);
+ if (npages == 1) {
+ *writable = true;
+ put_page(page[0]);
+ page[0] = wpage[0];
}
+
+ npages = 1;
}
+ *pfn = page_to_pfn(page[0]);
+ return npages;
+ }
- if (unlikely(npages != 1)) {
- struct vm_area_struct *vma;
+ /*
+ * Check that @vma permits the access being faulted in: it must be
+ * readable, and also writable when @write_fault is set.
+ */
+ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+ {
+ if (unlikely(!(vma->vm_flags & VM_READ)))
+ return false;
- if (atomic)
- return get_fault_pfn();
+ if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
+ return false;
- down_read(&current->mm->mmap_sem);
- if (npages == -EHWPOISON ||
- (!async && check_user_page_hwpoison(addr))) {
- up_read(&current->mm->mmap_sem);
- get_page(hwpoison_page);
- return page_to_pfn(hwpoison_page);
- }
+ return true;
+ }
- vma = find_vma_intersection(current->mm, addr, addr+1);
-
- if (vma == NULL)
- pfn = get_fault_pfn();
- else if ((vma->vm_flags & VM_PFNMAP)) {
- pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- BUG_ON(!kvm_is_mmio_pfn(pfn));
- } else {
- if (async && (vma->vm_flags & VM_WRITE))
- *async = true;
- pfn = get_fault_pfn();
- }
- up_read(&current->mm->mmap_sem);
- } else
- pfn = page_to_pfn(page[0]);
+ /*
+ * Pin guest page in memory and return its pfn.
+ * @addr: host virtual address which maps memory to the guest
+ * @atomic: whether this function must not sleep
+ * @async: whether this function needs to wait for IO to complete if
+ * the host page is not in memory
+ * @write_fault: whether we should get a writable host page
+ * @writable: whether it allows to map a writable host page for !@write_fault
+ *
+ * The function will map a writable host page for these two cases:
+ * 1): @write_fault = true
+ * 2): @write_fault = false && @writable, @writable will tell the caller
+ * whether the mapping is writable.
+ *
+ * Returns the pfn on success. On failure returns KVM_PFN_ERR_FAULT
+ * (atomic caller and the fast path failed, no vma covers @addr, or the
+ * vma is not VM_PFNMAP) or KVM_PFN_ERR_HWPOISON. For a VM_PFNMAP vma the
+ * pfn is computed from the vma offset and must be an MMIO pfn. *@async,
+ * when provided, is set to true if the fault hit an ordinary vma that
+ * permits the access.
+ */
+ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable)
+ {
+ struct vm_area_struct *vma;
+ pfn_t pfn = 0;
+ int npages;
+
+ /* we can do it either atomically or asynchronously, not both */
+ BUG_ON(atomic && async);
+ if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
+ return pfn;
+
+ if (atomic)
+ return KVM_PFN_ERR_FAULT;
+
+ npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+ if (npages == 1)
+ return pfn;
+
+ down_read(&current->mm->mmap_sem);
+ if (npages == -EHWPOISON ||
+ (!async && check_user_page_hwpoison(addr))) {
+ pfn = KVM_PFN_ERR_HWPOISON;
+ goto exit;
+ }
+
+ vma = find_vma_intersection(current->mm, addr, addr + 1);
+
+ if (vma == NULL)
+ pfn = KVM_PFN_ERR_FAULT;
+ else if ((vma->vm_flags & VM_PFNMAP)) {
+ pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ BUG_ON(!kvm_is_mmio_pfn(pfn));
+ } else {
+ if (async && vma_is_valid(vma, write_fault))
+ *async = true;
+ pfn = KVM_PFN_ERR_FAULT;
+ }
+ exit:
+ up_read(&current->mm->mmap_sem);
return pfn;
}
- pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+ /*
+ * Translate @gfn in @slot to a pfn.
+ * Returns KVM_PFN_ERR_RO_FAULT when the hva lookup reports
+ * KVM_HVA_ERR_RO_BAD, KVM_PFN_ERR_BAD for any other error hva, and
+ * otherwise whatever hva_to_pfn() returns for the resolved address.
+ */
+ static pfn_t
+ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+ bool *async, bool write_fault, bool *writable)
{
- return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+ unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+
+ if (addr == KVM_HVA_ERR_RO_BAD)
+ return KVM_PFN_ERR_RO_FAULT;
+
+ if (kvm_is_error_hva(addr))
+ return KVM_PFN_ERR_BAD;
+
+ /*
+ * Do not map writable pfn in the readonly memslot: report
+ * "not writable" to the caller and pass NULL on to hva_to_pfn().
+ */
+ if (writable && memslot_is_readonly(slot)) {
+ *writable = false;
+ writable = NULL;
+ }
+
+ return hva_to_pfn(addr, atomic, async, write_fault,
+ writable);
}
- EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
bool write_fault, bool *writable)
{
- unsigned long addr;
+ struct kvm_memory_slot *slot;
if (async)
*async = false;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr)) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
- }
+ slot = gfn_to_memslot(kvm, gfn);
- return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+ return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+ writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
- pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn)
+ pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+ return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+ }
+
+ pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
{
- unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+ return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
}
+ EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
int nr_pages)
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+ /*
+ * Convert @pfn to its struct page.
+ * Returns KVM_ERR_PTR_BAD_PAGE for an error pfn, and also (after a
+ * warning) for an MMIO pfn.
+ */
+ static struct page *kvm_pfn_to_page(pfn_t pfn)
+ {
+ if (is_error_pfn(pfn))
+ return KVM_ERR_PTR_BAD_PAGE;
+
+ if (kvm_is_mmio_pfn(pfn)) {
+ WARN_ON(1);
+ return KVM_ERR_PTR_BAD_PAGE;
+ }
+
+ return pfn_to_page(pfn);
+ }
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
pfn = gfn_to_pfn(kvm, gfn);
- if (!kvm_is_mmio_pfn(pfn))
- return pfn_to_page(pfn);
-
- WARN_ON(kvm_is_mmio_pfn(pfn));
- get_page(bad_page);
- return bad_page;
+ return kvm_pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_page_clean(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
void kvm_release_pfn_clean(pfn_t pfn)
{
+ WARN_ON(is_error_pfn(pfn));
+
if (!kvm_is_mmio_pfn(pfn))
put_page(pfn_to_page(pfn));
}
void kvm_release_page_dirty(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
int r;
unsigned long addr;
- addr = gfn_to_hva(kvm, gfn);
+ addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
- r = __copy_from_user(data, (void __user *)addr + offset, len);
+ r = kvm_read_hva(data, (void __user *)addr + offset, len);
if (r)
return -EFAULT;
return 0;
gfn_t gfn = gpa >> PAGE_SHIFT;
int offset = offset_in_page(gpa);
- addr = gfn_to_hva(kvm, gfn);
+ addr = gfn_to_hva_read(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
pagefault_disable();
- r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+ r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
pagefault_enable();
if (r)
return -EFAULT;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
+ #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+ /*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * Most eligible candidate to yield is decided by following heuristics:
+ *
+ * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
+ * (preempted lock holder), indicated by @in_spin_loop.
+ * Set at the beginning and cleared at the end of interception/PLE handler.
+ *
+ * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
+ * chance last time (mostly it has become eligible now since we have probably
+ * yielded to lockholder in last iteration. This is done by toggling
+ * @dy_eligible each time a VCPU is checked for eligibility.)
+ *
+ * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
+ * to preempted lock-holder could result in wrong VCPU selection and CPU
+ * burning. Giving priority for a potential lock-holder increases lock
+ * progress.
+ *
+ * Since algorithm is based on heuristics, accessing another VCPU data without
+ * locking does not harm. It may result in trying to yield to same VCPU, fail
+ * and continue with next VCPU and so on.
+ *
+ * Returns true if @vcpu should be considered as a yield target. As a side
+ * effect, @dy_eligible is flipped whenever @vcpu is found in a spin loop.
+ */
+ bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+ {
+ bool eligible;
+
+ eligible = !vcpu->spin_loop.in_spin_loop ||
+ (vcpu->spin_loop.in_spin_loop &&
+ vcpu->spin_loop.dy_eligible);
+
+ if (vcpu->spin_loop.in_spin_loop)
+ kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+
+ return eligible;
+ }
+ #endif
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
struct kvm *kvm = me->kvm;
int pass;
int i;
+ kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
* currently running, because it got preempted by something
continue;
if (waitqueue_active(&vcpu->wq))
continue;
+ if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+ continue;
if (kvm_vcpu_yield_to(vcpu)) {
kvm->last_boosted_vcpu = i;
yielded = 1;
}
}
}
+ kvm_vcpu_set_in_spin_loop(me, false);
+
+ /* Ensure vcpu is not eligible during next spinloop */
+ kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
#endif
- vcpu_load(vcpu);
+ r = vcpu_load(vcpu);
+ if (r)
+ return r;
switch (ioctl) {
case KVM_RUN:
r = -EINVAL;
if (copy_from_user(&csigset, sigmask_arg->sigset,
sizeof csigset))
goto out;
- }
- sigset_from_compat(&sigset, &csigset);
- r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+ sigset_from_compat(&sigset, &csigset);
+ r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+ } else
+ r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
break;
}
default:
r = kvm_send_userspace_msi(kvm, &msi);
break;
}
+ #endif
+ #ifdef __KVM_HAVE_IRQ_LINE
+ case KVM_IRQ_LINE_STATUS:
+ case KVM_IRQ_LINE: {
+ struct kvm_irq_level irq_event;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq_event, argp, sizeof irq_event))
+ goto out;
+
+ r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (ioctl == KVM_IRQ_LINE_STATUS) {
+ if (copy_to_user(argp, &irq_event, sizeof irq_event))
+ goto out;
+ }
+
+ r = 0;
+ break;
+ }
#endif
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
.resume = kvm_resume,
};
- struct page *bad_page;
- pfn_t bad_pfn;
-
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
if (r)
goto out_fail;
- bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (bad_page == NULL) {
- r = -ENOMEM;
- goto out;
- }
-
- bad_pfn = page_to_pfn(bad_page);
-
- hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (hwpoison_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- hwpoison_pfn = page_to_pfn(hwpoison_page);
-
- fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (fault_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- fault_pfn = page_to_pfn(fault_page);
-
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out_free_0;
out_free_0a:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
- if (fault_page)
- __free_page(fault_page);
- if (hwpoison_page)
- __free_page(hwpoison_page);
- __free_page(bad_page);
- out:
kvm_arch_exit();
out_fail:
return r;
kvm_arch_hardware_unsetup();
kvm_arch_exit();
free_cpumask_var(cpus_hardware_enabled);
- __free_page(fault_page);
- __free_page(hwpoison_page);
- __free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);