#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
+ #include <linux/radix-tree.h>
#include <asm/bug.h>
#include <asm/page.h>
/**
* struct gmap_struct - guest address space
+ * @crst_list: list of all crst tables used in the guest address space
* @mm: pointer to the parent mm_struct
+ * @guest_to_host: radix tree with guest to host address translation
+ * @host_to_guest: radix tree with pointer to segment table entries
+ * @guest_table_lock: spinlock to protect all entries in the guest page table
* @table: pointer to the page directory
* @asce: address space control element for gmap page table
- * @crst_list: list of all crst tables used in the guest address space
* @pfault_enabled: defines if pfaults are applicable for the guest
*/
struct gmap {
struct list_head list;
+ struct list_head crst_list;
struct mm_struct *mm;
+ struct radix_tree_root guest_to_host;
+ struct radix_tree_root host_to_guest;
+ spinlock_t guest_table_lock;
unsigned long *table;
unsigned long asce;
+ unsigned long asce_end;
void *private;
- struct list_head crst_list;
bool pfault_enabled;
};
- /**
- * struct gmap_rmap - reverse mapping for segment table entries
- * @gmap: pointer to the gmap_struct
- * @entry: pointer to a segment table entry
- * @vmaddr: virtual address in the guest address space
- */
- struct gmap_rmap {
- struct list_head list;
- struct gmap *gmap;
- unsigned long *entry;
- unsigned long vmaddr;
- };
-
- /**
- * struct gmap_pgtable - gmap information attached to a page table
- * @vmaddr: address of the 1MB segment in the process virtual memory
- * @mapper: list of segment table entries mapping a page table
- */
- struct gmap_pgtable {
- unsigned long vmaddr;
- struct list_head mapper;
- };
-
/**
* struct gmap_notifier - notify function block for page invalidation
* @notifier_call: address of callback function
*/
struct gmap_notifier {
struct list_head list;
- void (*notifier_call)(struct gmap *gmap, unsigned long address);
+ void (*notifier_call)(struct gmap *gmap, unsigned long gaddr);
};
- struct gmap *gmap_alloc(struct mm_struct *mm);
+ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
void gmap_free(struct gmap *gmap);
void gmap_enable(struct gmap *gmap);
void gmap_disable(struct gmap *gmap);
int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long to, unsigned long len);
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
- unsigned long __gmap_translate(unsigned long address, struct gmap *);
- unsigned long gmap_translate(unsigned long address, struct gmap *);
- unsigned long __gmap_fault(unsigned long address, struct gmap *);
- unsigned long gmap_fault(unsigned long address, struct gmap *);
- void gmap_discard(unsigned long from, unsigned long to, struct gmap *);
- void __gmap_zap(unsigned long address, struct gmap *);
+ unsigned long __gmap_translate(struct gmap *, unsigned long gaddr);
+ unsigned long gmap_translate(struct gmap *, unsigned long gaddr);
+ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr);
+ int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags);
+ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
+ void __gmap_zap(struct gmap *, unsigned long gaddr);
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *);
void gmap_register_ipte_notifier(struct gmap_notifier *);
void gmap_unregister_ipte_notifier(struct gmap_notifier *);
int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
- void gmap_do_ipte_notify(struct mm_struct *, pte_t *);
+ void gmap_do_ipte_notify(struct mm_struct *, unsigned long addr, pte_t *);
static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
+ unsigned long addr,
pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
if (pgste_val(pgste) & PGSTE_IN_BIT) {
pgste_val(pgste) &= ~PGSTE_IN_BIT;
- gmap_do_ipte_notify(mm, ptep);
+ gmap_do_ipte_notify(mm, addr, ptep);
}
#endif
return pgste;
pgste_val(pgste) &= ~PGSTE_UC_BIT;
pte = *ptep;
if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
- pgste = pgste_ipte_notify(mm, ptep, pgste);
+ pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
__ptep_ipte(addr, ptep);
if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
pte_val(pte) |= _PAGE_PROTECT;
unsigned long addr, pte_t *ptep)
{
pgste_t pgste;
- pte_t pte;
+ pte_t pte, oldpte;
int young;
if (mm_has_pgste(vma->vm_mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_ipte_notify(vma->vm_mm, ptep, pgste);
+ pgste = pgste_ipte_notify(vma->vm_mm, addr, ptep, pgste);
}
- pte = *ptep;
+ oldpte = pte = *ptep;
ptep_flush_direct(vma->vm_mm, addr, ptep);
young = pte_young(pte);
pte = pte_mkold(pte);
if (mm_has_pgste(vma->vm_mm)) {
+ pgste = pgste_update_all(&oldpte, pgste, vma->vm_mm);
pgste = pgste_set_pte(ptep, pgste, pte);
pgste_set_unlock(ptep, pgste);
} else
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_ipte_notify(mm, ptep, pgste);
+ pgste = pgste_ipte_notify(mm, address, ptep, pgste);
}
pte = *ptep;
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
- pgste_ipte_notify(mm, ptep, pgste);
+ pgste_ipte_notify(mm, address, ptep, pgste);
}
pte = *ptep;
if (mm_has_pgste(vma->vm_mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_ipte_notify(vma->vm_mm, ptep, pgste);
+ pgste = pgste_ipte_notify(vma->vm_mm, address, ptep, pgste);
}
pte = *ptep;
if (!full && mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_ipte_notify(mm, ptep, pgste);
+ pgste = pgste_ipte_notify(mm, address, ptep, pgste);
}
pte = *ptep;
if (pte_write(pte)) {
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_ipte_notify(mm, ptep, pgste);
+ pgste = pgste_ipte_notify(mm, address, ptep, pgste);
}
ptep_flush_lazy(mm, address, ptep);
return 0;
if (mm_has_pgste(vma->vm_mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_ipte_notify(vma->vm_mm, ptep, pgste);
+ pgste = pgste_ipte_notify(vma->vm_mm, address, ptep, pgste);
}
ptep_flush_direct(vma->vm_mm, address, ptep);
if (mm_has_pgste(vma->vm_mm)) {
+ pgste_set_key(ptep, pgste, entry, vma->vm_mm);
pgste = pgste_set_pte(ptep, pgste, entry);
pgste_set_unlock(ptep, pgste);
} else
#include <asm/processor.h>
#include <asm/io.h>
+ #include <asm/ioctl.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include "coalesced_mmio.h"
#include "async_pf.h"
+ #include "vfio.h"
#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>
static void hardware_disable_all(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
- static void update_memslots(struct kvm_memslots *slots,
- struct kvm_memory_slot *new, u64 last_generation);
static void kvm_release_pfn_dirty(pfn_t pfn);
static void mark_page_dirty_in_slot(struct kvm *kvm,
bool kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn))
- return PageReserved(pfn_to_page(pfn));
+ return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
return true;
}
struct pid *oldpid = vcpu->pid;
struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
rcu_assign_pointer(vcpu->pid, newpid);
- synchronize_rcu();
+ if (oldpid)
+ synchronize_rcu();
put_pid(oldpid);
}
cpu = get_cpu();
{
}
- static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
+ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
int i, cpu, me;
cpumask_var_t cpus;
long dirty_count = kvm->tlbs_dirty;
smp_mb();
- if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+ if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm->stat.remote_tlb_flush;
cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
void kvm_reload_remote_mmus(struct kvm *kvm)
{
- make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}
void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
- make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
- make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
+
+ kvm_arch_mmu_notifier_invalidate_page(kvm, address);
+
srcu_read_unlock(&kvm->srcu, idx);
}
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
struct mm_struct *mm,
- unsigned long address)
+ unsigned long start,
+ unsigned long end)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int young, idx;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
- young = kvm_age_hva(kvm, address);
+ young = kvm_age_hva(kvm, start, end);
if (young)
kvm_flush_remote_tlbs(kvm);
kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!kvm->memslots)
goto out_err_no_srcu;
+
+ /*
+ * Init kvm generation close to the maximum to easily test the
+ * code of handling generation number wrap-around.
+ */
+ kvm->memslots->generation = -150;
+
kvm_init_memslots_id(kvm);
if (init_srcu_struct(&kvm->srcu))
goto out_err_no_srcu;
}
static void update_memslots(struct kvm_memslots *slots,
- struct kvm_memory_slot *new,
- u64 last_generation)
+ struct kvm_memory_slot *new)
{
if (new) {
int id = new->id;
if (new->npages != npages)
sort_memslots(slots);
}
-
- slots->generation = last_generation + 1;
}
static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
{
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
- #ifdef KVM_CAP_READONLY_MEM
+ #ifdef __KVM_HAVE_READONLY_MEM
valid_flags |= KVM_MEM_READONLY;
#endif
{
struct kvm_memslots *old_memslots = kvm->memslots;
- update_memslots(slots, new, kvm->memslots->generation);
+ /*
+ * Set the low bit in the generation, which disables SPTE caching
+ * until the end of synchronize_srcu_expedited.
+ */
+ WARN_ON(old_memslots->generation & 1);
+ slots->generation = old_memslots->generation + 1;
+
+ update_memslots(slots, new);
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);
+ /*
+ * Increment the new memslot generation a second time. This prevents
+ * vm exits that race with memslot updates from caching a memslot
+ * generation that will (potentially) be valid forever.
+ */
+ slots->generation++;
+
kvm_arch_memslots_updated(kvm);
return old_memslots;
base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
npages = mem->memory_size >> PAGE_SHIFT;
- r = -EINVAL;
if (npages > KVM_MEM_MAX_NR_PAGES)
goto out;
new.npages = npages;
new.flags = mem->flags;
- r = -EINVAL;
if (npages) {
if (!old.npages)
change = KVM_MR_CREATE;
}
if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
- r = -ENOMEM;
slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
GFP_KERNEL);
if (!slots)
* If writable is set to false, the hva returned by this function is only
* allowed to be read.
*/
- unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
+ gfn_t gfn, bool *writable)
{
- struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
if (!kvm_is_error_hva(hva) && writable)
return hva;
}
+ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+ {
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+ return gfn_to_hva_memslot_prot(slot, gfn, writable);
+ }
+
static int kvm_read_hva(void *data, void __user *hva, int len)
{
return __copy_from_user(data, hva, len);
return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
}
+ int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, bool write_fault,
+ struct page **pagep)
+ {
+ int npages;
+ int locked = 1;
+ int flags = FOLL_TOUCH | FOLL_HWPOISON |
+ (pagep ? FOLL_GET : 0) |
+ (write_fault ? FOLL_WRITE : 0);
+
+ /*
+ * If retrying the fault, we get here *not* having allowed the filemap
+ * to wait on the page lock. We should now allow waiting on the IO with
+ * the mmap semaphore released.
+ */
+ down_read(&mm->mmap_sem);
+ npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL,
+ &locked);
+ if (!locked) {
+ VM_BUG_ON(npages);
+
+ if (!pagep)
+ return 0;
+
+ /*
+ * The previous call has now waited on the IO. Now we can
+ * retry and complete. Pass TRIED to ensure we do not re
+ * schedule async IO (see e.g. filemap_fault).
+ */
+ down_read(&mm->mmap_sem);
+ npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED,
+ pagep, NULL, NULL);
+ }
+ up_read(&mm->mmap_sem);
+ return npages;
+ }
+
static inline int check_user_page_hwpoison(unsigned long addr)
{
int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
npages = get_user_page_nowait(current, current->mm,
addr, write_fault, page);
up_read(¤t->mm->mmap_sem);
- } else
- npages = get_user_pages_fast(addr, 1, write_fault,
- page);
+ } else {
+ /*
+ * By now we have tried gup_fast, and possibly async_pf, and we
+ * are certainly not atomic. Time to retry the gup, allowing
+ * mmap semaphore to be relinquished in the case of IO.
+ */
+ npages = kvm_get_user_page_io(current, current->mm, addr,
+ write_fault, page);
+ }
if (npages != 1)
return npages;
rcu_read_lock();
pid = rcu_dereference(target->pid);
if (pid)
- task = get_pid_task(target->pid, PIDTYPE_PID);
+ task = get_pid_task(pid, PIDTYPE_PID);
rcu_read_unlock();
if (!task)
return ret;
bool eligible;
eligible = !vcpu->spin_loop.in_spin_loop ||
- (vcpu->spin_loop.in_spin_loop &&
- vcpu->spin_loop.dy_eligible);
+ vcpu->spin_loop.dy_eligible;
if (vcpu->spin_loop.in_spin_loop)
kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
if (vcpu->kvm->mm != current->mm)
return -EIO;
+ if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
+ return -EINVAL;
+
#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
/*
* Special cases: vcpu ioctls that are asynchronous to vcpu execution,
return filp->private_data;
}
+ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
+ #ifdef CONFIG_KVM_MPIC
+ [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
+ [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
+ #endif
+
+ #ifdef CONFIG_KVM_XICS
+ [KVM_DEV_TYPE_XICS] = &kvm_xics_ops,
+ #endif
+ };
+
+ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
+ {
+ if (type >= ARRAY_SIZE(kvm_device_ops_table))
+ return -ENOSPC;
+
+ if (kvm_device_ops_table[type] != NULL)
+ return -EEXIST;
+
+ kvm_device_ops_table[type] = ops;
+ return 0;
+ }
+
static int kvm_ioctl_create_device(struct kvm *kvm,
struct kvm_create_device *cd)
{
bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
int ret;
- switch (cd->type) {
- #ifdef CONFIG_KVM_MPIC
- case KVM_DEV_TYPE_FSL_MPIC_20:
- case KVM_DEV_TYPE_FSL_MPIC_42:
- ops = &kvm_mpic_ops;
- break;
- #endif
- #ifdef CONFIG_KVM_XICS
- case KVM_DEV_TYPE_XICS:
- ops = &kvm_xics_ops;
- break;
- #endif
- #ifdef CONFIG_KVM_VFIO
- case KVM_DEV_TYPE_VFIO:
- ops = &kvm_vfio_ops;
- break;
- #endif
- #ifdef CONFIG_KVM_ARM_VGIC
- case KVM_DEV_TYPE_ARM_VGIC_V2:
- ops = &kvm_arm_vgic_v2_ops;
- break;
- #endif
- #ifdef CONFIG_S390
- case KVM_DEV_TYPE_FLIC:
- ops = &kvm_flic_ops;
- break;
- #endif
- default:
+ if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
+ return -ENODEV;
+
+ ops = kvm_device_ops_table[cd->type];
+ if (ops == NULL)
return -ENODEV;
- }
if (test)
return 0;
switch (ioctl) {
case KVM_GET_API_VERSION:
- r = -EINVAL;
if (arg)
goto out;
r = KVM_API_VERSION;
r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
break;
case KVM_GET_VCPU_MMAP_SIZE:
- r = -EINVAL;
if (arg)
goto out;
r = PAGE_SIZE; /* struct kvm_run */
cpumask_set_cpu(cpu, cpus_hardware_enabled);
- r = kvm_arch_hardware_enable(NULL);
+ r = kvm_arch_hardware_enable();
if (r) {
cpumask_clear_cpu(cpu, cpus_hardware_enabled);
if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
return;
cpumask_clear_cpu(cpu, cpus_hardware_enabled);
- kvm_arch_hardware_disable(NULL);
+ kvm_arch_hardware_disable();
}
static void hardware_disable(void)
if (vcpu->preempted)
vcpu->preempted = false;
+ kvm_arch_sched_in(vcpu, cpu);
+
kvm_arch_vcpu_load(vcpu, cpu);
}
goto out_undebugfs;
}
+ r = kvm_vfio_ops_init();
+ WARN_ON(r);
+
return 0;
out_undebugfs: