virt/kvm/kvm_main.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine (KVM) Hypervisor
   4  *
   5  * Copyright (C) 2006 Qumranet, Inc.
   6  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   7  *
   8  * Authors:
   9  *   Avi Kivity   <[email protected]>
  10  *   Yaniv Kamay  <[email protected]>
  11  */
  12
  13 #include <kvm/iodev.h>
  14
  15 #include <linux/kvm_host.h>
  16 #include <linux/kvm.h>
  17 #include <linux/module.h>
  18 #include <linux/errno.h>
  19 #include <linux/percpu.h>
  20 #include <linux/mm.h>
  21 #include <linux/miscdevice.h>
  22 #include <linux/vmalloc.h>
  23 #include <linux/reboot.h>
  24 #include <linux/debugfs.h>
  25 #include <linux/highmem.h>
  26 #include <linux/file.h>
  27 #include <linux/syscore_ops.h>
  28 #include <linux/cpu.h>
  29 #include <linux/sched/signal.h>
  30 #include <linux/sched/mm.h>
  31 #include <linux/sched/stat.h>
  32 #include <linux/cpumask.h>
  33 #include <linux/smp.h>
  34 #include <linux/anon_inodes.h>
  35 #include <linux/profile.h>
  36 #include <linux/kvm_para.h>
  37 #include <linux/pagemap.h>
  38 #include <linux/mman.h>
  39 #include <linux/swap.h>
  40 #include <linux/bitops.h>
  41 #include <linux/spinlock.h>
  42 #include <linux/compat.h>
  43 #include <linux/srcu.h>
  44 #include <linux/hugetlb.h>
  45 #include <linux/slab.h>
  46 #include <linux/sort.h>
  47 #include <linux/bsearch.h>
  48 #include <linux/io.h>
  49 #include <linux/lockdep.h>
  50 #include <linux/kthread.h>
  51 #include <linux/suspend.h>
  52
  53 #include <asm/processor.h>
  54 #include <asm/ioctl.h>
  55 #include <linux/uaccess.h>
  56
  57 #include "coalesced_mmio.h"
  58 #include "async_pf.h"
  59 #include "kvm_mm.h"
  60 #include "vfio.h"
  61
  62 #include <trace/events/ipi.h>
  63
  64 #define CREATE_TRACE_POINTS
  65 #include <trace/events/kvm.h>
  66
  67 #include <linux/kvm_dirty_ring.h>
  68
  69
  70 /* Worst case buffer size needed for holding an integer. */
  71 #define ITOA_MAX_LEN 12
  72
  73 MODULE_AUTHOR("Qumranet");
  74 MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
  75 MODULE_LICENSE("GPL");
  76
  77 /* Architectures should define their poll value according to the halt latency */
  78 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
  79 module_param(halt_poll_ns, uint, 0644);
  80 EXPORT_SYMBOL_GPL(halt_poll_ns);
  81
  82 /* Default doubles per-vcpu halt_poll_ns. */
  83 unsigned int halt_poll_ns_grow = 2;
  84 module_param(halt_poll_ns_grow, uint, 0644);
  85 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
  86
  87 /* The start value to grow halt_poll_ns from */
  88 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
  89 module_param(halt_poll_ns_grow_start, uint, 0644);
  90 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
  91
  92 /* Default halves per-vcpu halt_poll_ns. */
  93 unsigned int halt_poll_ns_shrink = 2;
  94 module_param(halt_poll_ns_shrink, uint, 0644);
  95 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  96
  97 /*
  98  * Allow direct access (from KVM or the CPU) without MMU notifier protection
  99  * to unpinned pages.
 100  */
 101 static bool allow_unsafe_mappings;
 102 module_param(allow_unsafe_mappings, bool, 0444);
 103
 104 /*
 105  * Ordering of locks:
 106  *
 107  *      kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 108  */
 109
 110 DEFINE_MUTEX(kvm_lock);
 111 LIST_HEAD(vm_list);
 112
 113 static struct kmem_cache *kvm_vcpu_cache;
 114
 115 static __read_mostly struct preempt_ops kvm_preempt_ops;
 116 static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
 117
 118 static struct dentry *kvm_debugfs_dir;
 119
 120 static const struct file_operations stat_fops_per_vm;
 121
 122 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 123                            unsigned long arg);
 124 #ifdef CONFIG_KVM_COMPAT
 125 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
 126                                   unsigned long arg);
 127 #define KVM_COMPAT(c)   .compat_ioctl   = (c)
 128 #else
 129 /*
 130  * For architectures that don't implement a compat infrastructure,
 131  * adopt a double line of defense:
 132  * - Prevent a compat task from opening /dev/kvm
 133  * - If the open has been done by a 64bit task, and the KVM fd
 134  *   passed to a compat task, let the ioctls fail.
 135  */
 136 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
 137                                 unsigned long arg) { return -EINVAL; }
 138
 139 static int kvm_no_compat_open(struct inode *inode, struct file *file)
 140 {
 141         return is_compat_task() ? -ENODEV : 0;
 142 }
 143 #define KVM_COMPAT(c)   .compat_ioctl   = kvm_no_compat_ioctl,  \
 144                         .open           = kvm_no_compat_open
 145 #endif
 146 static int kvm_enable_virtualization(void);
 147 static void kvm_disable_virtualization(void);
 148
 149 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 150
 151 #define KVM_EVENT_CREATE_VM 0
 152 #define KVM_EVENT_DESTROY_VM 1
 153 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 154 static unsigned long long kvm_createvm_count;
 155 static unsigned long long kvm_active_vms;
 156
 157 static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
 158
 159 __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
 160 {
 161 }
 162
 163 /*
 164  * Switches to specified vcpu, until a matching vcpu_put()
 165  */
 166 void vcpu_load(struct kvm_vcpu *vcpu)
 167 {
 168         int cpu = get_cpu();
 169
 170         __this_cpu_write(kvm_running_vcpu, vcpu);
 171         preempt_notifier_register(&vcpu->preempt_notifier);
 172         kvm_arch_vcpu_load(vcpu, cpu);
 173         put_cpu();
 174 }
 175 EXPORT_SYMBOL_GPL(vcpu_load);
 176
 177 void vcpu_put(struct kvm_vcpu *vcpu)
 178 {
 179         preempt_disable();
 180         kvm_arch_vcpu_put(vcpu);
 181         preempt_notifier_unregister(&vcpu->preempt_notifier);
 182         __this_cpu_write(kvm_running_vcpu, NULL);
 183         preempt_enable();
 184 }
 185 EXPORT_SYMBOL_GPL(vcpu_put);
 186
 187 /* TODO: merge with kvm_arch_vcpu_should_kick */
 188 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
 189 {
 190         int mode = kvm_vcpu_exiting_guest_mode(vcpu);
 191
 192         /*
 193          * We need to wait for the VCPU to reenable interrupts and get out of
 194          * READING_SHADOW_PAGE_TABLES mode.
 195          */
 196         if (req & KVM_REQUEST_WAIT)
 197                 return mode != OUTSIDE_GUEST_MODE;
 198
 199         /*
 200          * Need to kick a running VCPU, but otherwise there is nothing to do.
 201          */
 202         return mode == IN_GUEST_MODE;
 203 }
 204
 205 static void ack_kick(void *_completed)
 206 {
 207 }
 208
 209 static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
 210 {
 211         if (cpumask_empty(cpus))
 212                 return false;
 213
 214         smp_call_function_many(cpus, ack_kick, NULL, wait);
 215         return true;
 216 }
 217
 218 static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
 219                                   struct cpumask *tmp, int current_cpu)
 220 {
 221         int cpu;
 222
 223         if (likely(!(req & KVM_REQUEST_NO_ACTION)))
 224                 __kvm_make_request(req, vcpu);
 225
 226         if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
 227                 return;
 228
 229         /*
 230          * Note, the vCPU could get migrated to a different pCPU at any point
 231          * after kvm_request_needs_ipi(), which could result in sending an IPI
 232          * to the previous pCPU.  But, that's OK because the purpose of the IPI
 233          * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
 234          * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
 235          * after this point is also OK, as the requirement is only that KVM wait
 236          * for vCPUs that were reading SPTEs _before_ any changes were
 237          * finalized. See kvm_vcpu_kick() for more details on handling requests.
 238          */
 239         if (kvm_request_needs_ipi(vcpu, req)) {
 240                 cpu = READ_ONCE(vcpu->cpu);
 241                 if (cpu != -1 && cpu != current_cpu)
 242                         __cpumask_set_cpu(cpu, tmp);
 243         }
 244 }
 245
 246 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 247                                  unsigned long *vcpu_bitmap)
 248 {
 249         struct kvm_vcpu *vcpu;
 250         struct cpumask *cpus;
 251         int i, me;
 252         bool called;
 253
 254         me = get_cpu();
 255
 256         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 257         cpumask_clear(cpus);
 258
 259         for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
 260                 vcpu = kvm_get_vcpu(kvm, i);
 261                 if (!vcpu)
 262                         continue;
 263                 kvm_make_vcpu_request(vcpu, req, cpus, me);
 264         }
 265
 266         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 267         put_cpu();
 268
 269         return called;
 270 }
 271
 272 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 273 {
 274         struct kvm_vcpu *vcpu;
 275         struct cpumask *cpus;
 276         unsigned long i;
 277         bool called;
 278         int me;
 279
 280         me = get_cpu();
 281
 282         cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 283         cpumask_clear(cpus);
 284
 285         kvm_for_each_vcpu(i, vcpu, kvm)
 286                 kvm_make_vcpu_request(vcpu, req, cpus, me);
 287
 288         called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 289         put_cpu();
 290
 291         return called;
 292 }
 293 EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
 294
 295 void kvm_flush_remote_tlbs(struct kvm *kvm)
 296 {
 297         ++kvm->stat.generic.remote_tlb_flush_requests;
 298
 299         /*
 300          * We want to publish modifications to the page tables before reading
 301          * mode. Pairs with a memory barrier in arch-specific code.
 302          * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
 303          * and smp_mb in walk_shadow_page_lockless_begin/end.
 304          * - powerpc: smp_mb in kvmppc_prepare_to_enter.
 305          *
 306          * There is already an smp_mb__after_atomic() before
 307          * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 308          * barrier here.
 309          */
 310         if (!kvm_arch_flush_remote_tlbs(kvm)
 311             || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 312                 ++kvm->stat.generic.remote_tlb_flush;
 313 }
 314 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 315
 316 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
 317 {
 318         if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
 319                 return;
 320
 321         /*
 322          * Fall back to a flushing entire TLBs if the architecture range-based
 323          * TLB invalidation is unsupported or can't be performed for whatever
 324          * reason.
 325          */
 326         kvm_flush_remote_tlbs(kvm);
 327 }
 328
 329 void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
 330                                    const struct kvm_memory_slot *memslot)
 331 {
 332         /*
 333          * All current use cases for flushing the TLBs for a specific memslot
 334          * are related to dirty logging, and many do the TLB flush out of
 335          * mmu_lock. The interaction between the various operations on memslot
 336          * must be serialized by slots_locks to ensure the TLB flush from one
 337          * operation is observed by any other operation on the same memslot.
 338          */
 339         lockdep_assert_held(&kvm->slots_lock);
 340         kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
 341 }
 342
 343 static void kvm_flush_shadow_all(struct kvm *kvm)
 344 {
 345         kvm_arch_flush_shadow_all(kvm);
 346         kvm_arch_guest_memory_reclaimed(kvm);
 347 }
 348
 349 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 350 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 351                                                gfp_t gfp_flags)
 352 {
 353         void *page;
 354
 355         gfp_flags |= mc->gfp_zero;
 356
 357         if (mc->kmem_cache)
 358                 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
 359
 360         page = (void *)__get_free_page(gfp_flags);
 361         if (page && mc->init_value)
 362                 memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
 363         return page;
 364 }
 365
 366 int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
 367 {
 368         gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
 369         void *obj;
 370
 371         if (mc->nobjs >= min)
 372                 return 0;
 373
 374         if (unlikely(!mc->objects)) {
 375                 if (WARN_ON_ONCE(!capacity))
 376                         return -EIO;
 377
 378                 /*
 379                  * Custom init values can be used only for page allocations,
 380                  * and obviously conflict with __GFP_ZERO.
 381                  */
 382                 if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
 383                         return -EIO;
 384
 385                 mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
 386                 if (!mc->objects)
 387                         return -ENOMEM;
 388
 389                 mc->capacity = capacity;
 390         }
 391
 392         /* It is illegal to request a different capacity across topups. */
 393         if (WARN_ON_ONCE(mc->capacity != capacity))
 394                 return -EIO;
 395
 396         while (mc->nobjs < mc->capacity) {
 397                 obj = mmu_memory_cache_alloc_obj(mc, gfp);
 398                 if (!obj)
 399                         return mc->nobjs >= min ? 0 : -ENOMEM;
 400                 mc->objects[mc->nobjs++] = obj;
 401         }
 402         return 0;
 403 }
 404
 405 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 406 {
 407         return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
 408 }
 409
 410 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 411 {
 412         return mc->nobjs;
 413 }
 414
 415 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 416 {
 417         while (mc->nobjs) {
 418                 if (mc->kmem_cache)
 419                         kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
 420                 else
 421                         free_page((unsigned long)mc->objects[--mc->nobjs]);
 422         }
 423
 424         kvfree(mc->objects);
 425
 426         mc->objects = NULL;
 427         mc->capacity = 0;
 428 }
 429
 430 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 431 {
 432         void *p;
 433
 434         if (WARN_ON(!mc->nobjs))
 435                 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
 436         else
 437                 p = mc->objects[--mc->nobjs];
 438         BUG_ON(!p);
 439         return p;
 440 }
 441 #endif
 442
 443 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 444 {
 445         mutex_init(&vcpu->mutex);
 446         vcpu->cpu = -1;
 447         vcpu->kvm = kvm;
 448         vcpu->vcpu_id = id;
 449         vcpu->pid = NULL;
 450         rwlock_init(&vcpu->pid_lock);
 451 #ifndef __KVM_HAVE_ARCH_WQP
 452         rcuwait_init(&vcpu->wait);
 453 #endif
 454         kvm_async_pf_vcpu_init(vcpu);
 455
 456         kvm_vcpu_set_in_spin_loop(vcpu, false);
 457         kvm_vcpu_set_dy_eligible(vcpu, false);
 458         vcpu->preempted = false;
 459         vcpu->ready = false;
 460         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 461         vcpu->last_used_slot = NULL;
 462
 463         /* Fill the stats id string for the vcpu */
 464         snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
 465                  task_pid_nr(current), id);
 466 }
 467
 468 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 469 {
 470         kvm_arch_vcpu_destroy(vcpu);
 471         kvm_dirty_ring_free(&vcpu->dirty_ring);
 472
 473         /*
 474          * No need for rcu_read_lock as VCPU_RUN is the only place that changes
 475          * the vcpu->pid pointer, and at destruction time all file descriptors
 476          * are already gone.
 477          */
 478         put_pid(vcpu->pid);
 479
 480         free_page((unsigned long)vcpu->run);
 481         kmem_cache_free(kvm_vcpu_cache, vcpu);
 482 }
 483
 484 void kvm_destroy_vcpus(struct kvm *kvm)
 485 {
 486         unsigned long i;
 487         struct kvm_vcpu *vcpu;
 488
 489         kvm_for_each_vcpu(i, vcpu, kvm) {
 490                 kvm_vcpu_destroy(vcpu);
 491                 xa_erase(&kvm->vcpu_array, i);
 492         }
 493
 494         atomic_set(&kvm->online_vcpus, 0);
 495 }
 496 EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
 497
 498 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 499 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 500 {
 501         return container_of(mn, struct kvm, mmu_notifier);
 502 }
 503
 504 typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
 505
 506 typedef void (*on_lock_fn_t)(struct kvm *kvm);
 507
 508 struct kvm_mmu_notifier_range {
 509         /*
 510          * 64-bit addresses, as KVM notifiers can operate on host virtual
 511          * addresses (unsigned long) and guest physical addresses (64-bit).
 512          */
 513         u64 start;
 514         u64 end;
 515         union kvm_mmu_notifier_arg arg;
 516         gfn_handler_t handler;
 517         on_lock_fn_t on_lock;
 518         bool flush_on_ret;
 519         bool may_block;
 520 };
 521
 522 /*
 523  * The inner-most helper returns a tuple containing the return value from the
 524  * arch- and action-specific handler, plus a flag indicating whether or not at
 525  * least one memslot was found, i.e. if the handler found guest memory.
 526  *
 527  * Note, most notifiers are averse to booleans, so even though KVM tracks the
 528  * return from arch code as a bool, outer helpers will cast it to an int. :-(
 529  */
 530 typedef struct kvm_mmu_notifier_return {
 531         bool ret;
 532         bool found_memslot;
 533 } kvm_mn_ret_t;
 534
 535 /*
 536  * Use a dedicated stub instead of NULL to indicate that there is no callback
 537  * function/handler.  The compiler technically can't guarantee that a real
 538  * function will have a non-zero address, and so it will generate code to
 539  * check for !NULL, whereas comparing against a stub will be elided at compile
 540  * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 541  */
 542 static void kvm_null_fn(void)
 543 {
 544
 545 }
 546 #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
 547
 548 /* Iterate over each memslot intersecting [start, last] (inclusive) range */
 549 #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)          \
 550         for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
 551              node;                                                           \
 552              node = interval_tree_iter_next(node, start, last))      \
 553
 554 static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
 555                                                            const struct kvm_mmu_notifier_range *range)
 556 {
 557         struct kvm_mmu_notifier_return r = {
 558                 .ret = false,
 559                 .found_memslot = false,
 560         };
 561         struct kvm_gfn_range gfn_range;
 562         struct kvm_memory_slot *slot;
 563         struct kvm_memslots *slots;
 564         int i, idx;
 565
 566         if (WARN_ON_ONCE(range->end <= range->start))
 567                 return r;
 568
 569         /* A null handler is allowed if and only if on_lock() is provided. */
 570         if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
 571                          IS_KVM_NULL_FN(range->handler)))
 572                 return r;
 573
 574         idx = srcu_read_lock(&kvm->srcu);
 575
 576         for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
 577                 struct interval_tree_node *node;
 578
 579                 slots = __kvm_memslots(kvm, i);
 580                 kvm_for_each_memslot_in_hva_range(node, slots,
 581                                                   range->start, range->end - 1) {
 582                         unsigned long hva_start, hva_end;
 583
 584                         slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
 585                         hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
 586                         hva_end = min_t(unsigned long, range->end,
 587                                         slot->userspace_addr + (slot->npages << PAGE_SHIFT));
 588
 589                         /*
 590                          * To optimize for the likely case where the address
 591                          * range is covered by zero or one memslots, don't
 592                          * bother making these conditional (to avoid writes on
 593                          * the second or later invocation of the handler).
 594                          */
 595                         gfn_range.arg = range->arg;
 596                         gfn_range.may_block = range->may_block;
 597
 598                         /*
 599                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
 600                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
 601                          */
 602                         gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
 603                         gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 604                         gfn_range.slot = slot;
 605
 606                         if (!r.found_memslot) {
 607                                 r.found_memslot = true;
 608                                 KVM_MMU_LOCK(kvm);
 609                                 if (!IS_KVM_NULL_FN(range->on_lock))
 610                                         range->on_lock(kvm);
 611
 612                                 if (IS_KVM_NULL_FN(range->handler))
 613                                         goto mmu_unlock;
 614                         }
 615                         r.ret |= range->handler(kvm, &gfn_range);
 616                 }
 617         }
 618
 619         if (range->flush_on_ret && r.ret)
 620                 kvm_flush_remote_tlbs(kvm);
 621
 622 mmu_unlock:
 623         if (r.found_memslot)
 624                 KVM_MMU_UNLOCK(kvm);
 625
 626         srcu_read_unlock(&kvm->srcu, idx);
 627
 628         return r;
 629 }
 630
 631 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 632                                                 unsigned long start,
 633                                                 unsigned long end,
 634                                                 gfn_handler_t handler,
 635                                                 bool flush_on_ret)
 636 {
 637         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 638         const struct kvm_mmu_notifier_range range = {
 639                 .start          = start,
 640                 .end            = end,
 641                 .handler        = handler,
 642                 .on_lock        = (void *)kvm_null_fn,
 643                 .flush_on_ret   = flush_on_ret,
 644                 .may_block      = false,
 645         };
 646
 647         return __kvm_handle_hva_range(kvm, &range).ret;
 648 }
 649
 650 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
 651                                                          unsigned long start,
 652                                                          unsigned long end,
 653                                                          gfn_handler_t handler)
 654 {
 655         return kvm_handle_hva_range(mn, start, end, handler, false);
 656 }
 657
 658 void kvm_mmu_invalidate_begin(struct kvm *kvm)
 659 {
 660         lockdep_assert_held_write(&kvm->mmu_lock);
 661         /*
 662          * The count increase must become visible at unlock time as no
 663          * spte can be established without taking the mmu_lock and
 664          * count is also read inside the mmu_lock critical section.
 665          */
 666         kvm->mmu_invalidate_in_progress++;
 667
 668         if (likely(kvm->mmu_invalidate_in_progress == 1)) {
 669                 kvm->mmu_invalidate_range_start = INVALID_GPA;
 670                 kvm->mmu_invalidate_range_end = INVALID_GPA;
 671         }
 672 }
 673
 674 void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
 675 {
 676         lockdep_assert_held_write(&kvm->mmu_lock);
 677
 678         WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
 679
 680         if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
 681                 kvm->mmu_invalidate_range_start = start;
 682                 kvm->mmu_invalidate_range_end = end;
 683         } else {
 684                 /*
 685                  * Fully tracking multiple concurrent ranges has diminishing
 686                  * returns. Keep things simple and just find the minimal range
 687                  * which includes the current and new ranges. As there won't be
 688                  * enough information to subtract a range after its invalidate
 689                  * completes, any ranges invalidated concurrently will
 690                  * accumulate and persist until all outstanding invalidates
 691                  * complete.
 692                  */
 693                 kvm->mmu_invalidate_range_start =
 694                         min(kvm->mmu_invalidate_range_start, start);
 695                 kvm->mmu_invalidate_range_end =
 696                         max(kvm->mmu_invalidate_range_end, end);
 697         }
 698 }
 699
 700 bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 701 {
 702         kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
 703         return kvm_unmap_gfn_range(kvm, range);
 704 }
 705
 706 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 707                                         const struct mmu_notifier_range *range)
 708 {
 709         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 710         const struct kvm_mmu_notifier_range hva_range = {
 711                 .start          = range->start,
 712                 .end            = range->end,
 713                 .handler        = kvm_mmu_unmap_gfn_range,
 714                 .on_lock        = kvm_mmu_invalidate_begin,
 715                 .flush_on_ret   = true,
 716                 .may_block      = mmu_notifier_range_blockable(range),
 717         };
 718
 719         trace_kvm_unmap_hva_range(range->start, range->end);
 720
 721         /*
 722          * Prevent memslot modification between range_start() and range_end()
 723          * so that conditionally locking provides the same result in both
 724          * functions.  Without that guarantee, the mmu_invalidate_in_progress
 725          * adjustments will be imbalanced.
 726          *
 727          * Pairs with the decrement in range_end().
 728          */
 729         spin_lock(&kvm->mn_invalidate_lock);
 730         kvm->mn_active_invalidate_count++;
 731         spin_unlock(&kvm->mn_invalidate_lock);
 732
 733         /*
 734          * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
 735          * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
 736          * each cache's lock.  There are relatively few caches in existence at
 737          * any given time, and the caches themselves can check for hva overlap,
 738          * i.e. don't need to rely on memslot overlap checks for performance.
 739          * Because this runs without holding mmu_lock, the pfn caches must use
 740          * mn_active_invalidate_count (see above) instead of
 741          * mmu_invalidate_in_progress.
 742          */
 743         gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
 744
 745         /*
 746          * If one or more memslots were found and thus zapped, notify arch code
 747          * that guest memory has been reclaimed.  This needs to be done *after*
 748          * dropping mmu_lock, as x86's reclaim path is slooooow.
 749          */
 750         if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
 751                 kvm_arch_guest_memory_reclaimed(kvm);
 752
 753         return 0;
 754 }
 755
 756 void kvm_mmu_invalidate_end(struct kvm *kvm)
 757 {
 758         lockdep_assert_held_write(&kvm->mmu_lock);
 759
 760         /*
 761          * This sequence increase will notify the kvm page fault that
 762          * the page that is going to be mapped in the spte could have
 763          * been freed.
 764          */
 765         kvm->mmu_invalidate_seq++;
 766         smp_wmb();
 767         /*
 768          * The above sequence increase must be visible before the
 769          * below count decrease, which is ensured by the smp_wmb above
 770          * in conjunction with the smp_rmb in mmu_invalidate_retry().
 771          */
 772         kvm->mmu_invalidate_in_progress--;
 773         KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
 774
 775         /*
 776          * Assert that at least one range was added between start() and end().
 777          * Not adding a range isn't fatal, but it is a KVM bug.
 778          */
 779         WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
 780 }
 781
 782 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 783                                         const struct mmu_notifier_range *range)
 784 {
 785         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 786         const struct kvm_mmu_notifier_range hva_range = {
 787                 .start          = range->start,
 788                 .end            = range->end,
 789                 .handler        = (void *)kvm_null_fn,
 790                 .on_lock        = kvm_mmu_invalidate_end,
 791                 .flush_on_ret   = false,
 792                 .may_block      = mmu_notifier_range_blockable(range),
 793         };
 794         bool wake;
 795
 796         __kvm_handle_hva_range(kvm, &hva_range);
 797
 798         /* Pairs with the increment in range_start(). */
 799         spin_lock(&kvm->mn_invalidate_lock);
 800         if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
 801                 --kvm->mn_active_invalidate_count;
 802         wake = !kvm->mn_active_invalidate_count;
 803         spin_unlock(&kvm->mn_invalidate_lock);
 804
 805         /*
 806          * There can only be one waiter, since the wait happens under
 807          * slots_lock.
 808          */
 809         if (wake)
 810                 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
 811 }
 812
 813 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 814                                               struct mm_struct *mm,
 815                                               unsigned long start,
 816                                               unsigned long end)
 817 {
 818         trace_kvm_age_hva(start, end);
 819
 820         return kvm_handle_hva_range(mn, start, end, kvm_age_gfn,
 821                                     !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
 822 }
 823
 824 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 825                                         struct mm_struct *mm,
 826                                         unsigned long start,
 827                                         unsigned long end)
 828 {
 829         trace_kvm_age_hva(start, end);
 830
 831         /*
 832          * Even though we do not flush TLB, this will still adversely
 833          * affect performance on pre-Haswell Intel EPT, where there is
 834          * no EPT Access Bit to clear so that we have to tear down EPT
 835          * tables instead. If we find this unacceptable, we can always
 836          * add a parameter to kvm_age_hva so that it effectively doesn't
 837          * do anything on clear_young.
 838          *
 839          * Also note that currently we never issue secondary TLB flushes
 840          * from clear_young, leaving this job up to the regular system
 841          * cadence. If we find this inaccurate, we might come up with a
 842          * more sophisticated heuristic later.
 843          */
 844         return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 845 }
 846
 847 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 848                                        struct mm_struct *mm,
 849                                        unsigned long address)
 850 {
 851         trace_kvm_test_age_hva(address);
 852
 853         return kvm_handle_hva_range_no_flush(mn, address, address + 1,
 854                                              kvm_test_age_gfn);
 855 }
 856
 857 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 858                                      struct mm_struct *mm)
 859 {
 860         struct kvm *kvm = mmu_notifier_to_kvm(mn);
 861         int idx;
 862
 863         idx = srcu_read_lock(&kvm->srcu);
 864         kvm_flush_shadow_all(kvm);
 865         srcu_read_unlock(&kvm->srcu, idx);
 866 }
 867
 868 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 869         .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
 870         .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
 871         .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
 872         .clear_young            = kvm_mmu_notifier_clear_young,
 873         .test_young             = kvm_mmu_notifier_test_young,
 874         .release                = kvm_mmu_notifier_release,
 875 };
 876
 877 static int kvm_init_mmu_notifier(struct kvm *kvm)
 878 {
 879         kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 880         return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 881 }
 882
 883 #else  /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */
 884
 885 static int kvm_init_mmu_notifier(struct kvm *kvm)
 886 {
 887         return 0;
 888 }
 889
 890 #endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */
 891
 892 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 893 static int kvm_pm_notifier_call(struct notifier_block *bl,
 894                                 unsigned long state,
 895                                 void *unused)
 896 {
 897         struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
 898
 899         return kvm_arch_pm_notifier(kvm, state);
 900 }
 901
 902 static void kvm_init_pm_notifier(struct kvm *kvm)
 903 {
 904         kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
 905         /* Suspend KVM before we suspend ftrace, RCU, etc. */
 906         kvm->pm_notifier.priority = INT_MAX;
 907         register_pm_notifier(&kvm->pm_notifier);
 908 }
 909
 910 static void kvm_destroy_pm_notifier(struct kvm *kvm)
 911 {
 912         unregister_pm_notifier(&kvm->pm_notifier);
 913 }
 914 #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
 915 static void kvm_init_pm_notifier(struct kvm *kvm)
 916 {
 917 }
 918
 919 static void kvm_destroy_pm_notifier(struct kvm *kvm)
 920 {
 921 }
 922 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 923
 924 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 925 {
 926         if (!memslot->dirty_bitmap)
 927                 return;
 928
 929         vfree(memslot->dirty_bitmap);
 930         memslot->dirty_bitmap = NULL;
 931 }
 932
 933 /* This does not remove the slot from struct kvm_memslots data structures */
 934 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 935 {
 936         if (slot->flags & KVM_MEM_GUEST_MEMFD)
 937                 kvm_gmem_unbind(slot);
 938
 939         kvm_destroy_dirty_bitmap(slot);
 940
 941         kvm_arch_free_memslot(kvm, slot);
 942
 943         kfree(slot);
 944 }
 945
 946 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
 947 {
 948         struct hlist_node *idnode;
 949         struct kvm_memory_slot *memslot;
 950         int bkt;
 951
 952         /*
 953          * The same memslot objects live in both active and inactive sets,
 954          * arbitrarily free using index '1' so the second invocation of this
 955          * function isn't operating over a structure with dangling pointers
 956          * (even though this function isn't actually touching them).
 957          */
 958         if (!slots->node_idx)
 959                 return;
 960
 961         hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
 962                 kvm_free_memslot(kvm, memslot);
 963 }
 964
 965 static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
 966 {
 967         switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
 968         case KVM_STATS_TYPE_INSTANT:
 969                 return 0444;
 970         case KVM_STATS_TYPE_CUMULATIVE:
 971         case KVM_STATS_TYPE_PEAK:
 972         default:
 973                 return 0644;
 974         }
 975 }
 976
 977
 978 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
 979 {
 980         int i;
 981         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
 982                                       kvm_vcpu_stats_header.num_desc;
 983
 984         if (IS_ERR(kvm->debugfs_dentry))
 985                 return;
 986
 987         debugfs_remove_recursive(kvm->debugfs_dentry);
 988
 989         if (kvm->debugfs_stat_data) {
 990                 for (i = 0; i < kvm_debugfs_num_entries; i++)
 991                         kfree(kvm->debugfs_stat_data[i]);
 992                 kfree(kvm->debugfs_stat_data);
 993         }
 994 }
 995
 996 static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
 997 {
 998         static DEFINE_MUTEX(kvm_debugfs_lock);
 999         struct dentry *dent;
1000         char dir_name[ITOA_MAX_LEN * 2];
1001         struct kvm_stat_data *stat_data;
1002         const struct _kvm_stats_desc *pdesc;
1003         int i, ret = -ENOMEM;
1004         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1005                                       kvm_vcpu_stats_header.num_desc;
1006
1007         if (!debugfs_initialized())
1008                 return 0;
1009
1010         snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
1011         mutex_lock(&kvm_debugfs_lock);
1012         dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
1013         if (dent) {
1014                 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
1015                 dput(dent);
1016                 mutex_unlock(&kvm_debugfs_lock);
1017                 return 0;
1018         }
1019         dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
1020         mutex_unlock(&kvm_debugfs_lock);
1021         if (IS_ERR(dent))
1022                 return 0;
1023
1024         kvm->debugfs_dentry = dent;
1025         kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
1026                                          sizeof(*kvm->debugfs_stat_data),
1027                                          GFP_KERNEL_ACCOUNT);
1028         if (!kvm->debugfs_stat_data)
1029                 goto out_err;
1030
1031         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
1032                 pdesc = &kvm_vm_stats_desc[i];
1033                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1034                 if (!stat_data)
1035                         goto out_err;
1036
1037                 stat_data->kvm = kvm;
1038                 stat_data->desc = pdesc;
1039                 stat_data->kind = KVM_STAT_VM;
1040                 kvm->debugfs_stat_data[i] = stat_data;
1041                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1042                                     kvm->debugfs_dentry, stat_data,
1043                                     &stat_fops_per_vm);
1044         }
1045
1046         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
1047                 pdesc = &kvm_vcpu_stats_desc[i];
1048                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1049                 if (!stat_data)
1050                         goto out_err;
1051
1052                 stat_data->kvm = kvm;
1053                 stat_data->desc = pdesc;
1054                 stat_data->kind = KVM_STAT_VCPU;
1055                 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1056                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
1057                                     kvm->debugfs_dentry, stat_data,
1058                                     &stat_fops_per_vm);
1059         }
1060
1061         kvm_arch_create_vm_debugfs(kvm);
1062         return 0;
1063 out_err:
1064         kvm_destroy_vm_debugfs(kvm);
1065         return ret;
1066 }
1067
1068 /*
1069  * Called after the VM is otherwise initialized, but just before adding it to
1070  * the vm_list.
1071  */
1072 int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1073 {
1074         return 0;
1075 }
1076
1077 /*
1078  * Called just after removing the VM from the vm_list, but before doing any
1079  * other destruction.
1080  */
1081 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1082 {
1083 }
1084
1085 /*
1086  * Called after per-vm debugfs created.  When called kvm->debugfs_dentry should
1087  * be setup already, so we can create arch-specific debugfs entries under it.
1088  * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
1089  * a per-arch destroy interface is not needed.
1090  */
1091 void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1092 {
1093 }
1094
1095 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
1096 {
1097         struct kvm *kvm = kvm_arch_alloc_vm();
1098         struct kvm_memslots *slots;
1099         int r, i, j;
1100
1101         if (!kvm)
1102                 return ERR_PTR(-ENOMEM);
1103
1104         KVM_MMU_LOCK_INIT(kvm);
1105         mmgrab(current->mm);
1106         kvm->mm = current->mm;
1107         kvm_eventfd_init(kvm);
1108         mutex_init(&kvm->lock);
1109         mutex_init(&kvm->irq_lock);
1110         mutex_init(&kvm->slots_lock);
1111         mutex_init(&kvm->slots_arch_lock);
1112         spin_lock_init(&kvm->mn_invalidate_lock);
1113         rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1114         xa_init(&kvm->vcpu_array);
1115 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
1116         xa_init(&kvm->mem_attr_array);
1117 #endif
1118
1119         INIT_LIST_HEAD(&kvm->gpc_list);
1120         spin_lock_init(&kvm->gpc_lock);
1121
1122         INIT_LIST_HEAD(&kvm->devices);
1123         kvm->max_vcpus = KVM_MAX_VCPUS;
1124
1125         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1126
1127         /*
1128          * Force subsequent debugfs file creations to fail if the VM directory
1129          * is not created (by kvm_create_vm_debugfs()).
1130          */
1131         kvm->debugfs_dentry = ERR_PTR(-ENOENT);
1132
1133         snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
1134                  task_pid_nr(current));
1135
1136         r = -ENOMEM;
1137         if (init_srcu_struct(&kvm->srcu))
1138                 goto out_err_no_srcu;
1139         if (init_srcu_struct(&kvm->irq_srcu))
1140                 goto out_err_no_irq_srcu;
1141
1142         r = kvm_init_irq_routing(kvm);
1143         if (r)
1144                 goto out_err_no_irq_routing;
1145
1146         refcount_set(&kvm->users_count, 1);
1147
1148         for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
1149                 for (j = 0; j < 2; j++) {
1150                         slots = &kvm->__memslots[i][j];
1151
1152                         atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
1153                         slots->hva_tree = RB_ROOT_CACHED;
1154                         slots->gfn_tree = RB_ROOT;
1155                         hash_init(slots->id_hash);
1156                         slots->node_idx = j;
1157
1158                         /* Generations must be different for each address space. */
1159                         slots->generation = i;
1160                 }
1161
1162                 rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
1163         }
1164
1165         r = -ENOMEM;
1166         for (i = 0; i < KVM_NR_BUSES; i++) {
1167                 rcu_assign_pointer(kvm->buses[i],
1168                         kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1169                 if (!kvm->buses[i])
1170                         goto out_err_no_arch_destroy_vm;
1171         }
1172
1173         r = kvm_arch_init_vm(kvm, type);
1174         if (r)
1175                 goto out_err_no_arch_destroy_vm;
1176
1177         r = kvm_enable_virtualization();
1178         if (r)
1179                 goto out_err_no_disable;
1180
1181 #ifdef CONFIG_HAVE_KVM_IRQCHIP
1182         INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1183 #endif
1184
1185         r = kvm_init_mmu_notifier(kvm);
1186         if (r)
1187                 goto out_err_no_mmu_notifier;
1188
1189         r = kvm_coalesced_mmio_init(kvm);
1190         if (r < 0)
1191                 goto out_no_coalesced_mmio;
1192
1193         r = kvm_create_vm_debugfs(kvm, fdname);
1194         if (r)
1195                 goto out_err_no_debugfs;
1196
1197         r = kvm_arch_post_init_vm(kvm);
1198         if (r)
1199                 goto out_err;
1200
1201         mutex_lock(&kvm_lock);
1202         list_add(&kvm->vm_list, &vm_list);
1203         mutex_unlock(&kvm_lock);
1204
1205         preempt_notifier_inc();
1206         kvm_init_pm_notifier(kvm);
1207
1208         return kvm;
1209
1210 out_err:
1211         kvm_destroy_vm_debugfs(kvm);
1212 out_err_no_debugfs:
1213         kvm_coalesced_mmio_free(kvm);
1214 out_no_coalesced_mmio:
1215 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1216         if (kvm->mmu_notifier.ops)
1217                 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1218 #endif
1219 out_err_no_mmu_notifier:
1220         kvm_disable_virtualization();
1221 out_err_no_disable:
1222         kvm_arch_destroy_vm(kvm);
1223 out_err_no_arch_destroy_vm:
1224         WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1225         for (i = 0; i < KVM_NR_BUSES; i++)
1226                 kfree(kvm_get_bus(kvm, i));
1227         kvm_free_irq_routing(kvm);
1228 out_err_no_irq_routing:
1229         cleanup_srcu_struct(&kvm->irq_srcu);
1230 out_err_no_irq_srcu:
1231         cleanup_srcu_struct(&kvm->srcu);
1232 out_err_no_srcu:
1233         kvm_arch_free_vm(kvm);
1234         mmdrop(current->mm);
1235         return ERR_PTR(r);
1236 }
1237
1238 static void kvm_destroy_devices(struct kvm *kvm)
1239 {
1240         struct kvm_device *dev, *tmp;
1241
1242         /*
1243          * We do not need to take the kvm->lock here, because nobody else
1244          * has a reference to the struct kvm at this point and therefore
1245          * cannot access the devices list anyhow.
1246          *
1247          * The device list is generally managed as an rculist, but list_del()
1248          * is used intentionally here. If a bug in KVM introduced a reader that
1249          * was not backed by a reference on the kvm struct, the hope is that
1250          * it'd consume the poisoned forward pointer instead of suffering a
1251          * use-after-free, even though this cannot be guaranteed.
1252          */
1253         list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1254                 list_del(&dev->vm_node);
1255                 dev->ops->destroy(dev);
1256         }
1257 }
1258
1259 static void kvm_destroy_vm(struct kvm *kvm)
1260 {
1261         int i;
1262         struct mm_struct *mm = kvm->mm;
1263
1264         kvm_destroy_pm_notifier(kvm);
1265         kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1266         kvm_destroy_vm_debugfs(kvm);
1267         kvm_arch_sync_events(kvm);
1268         mutex_lock(&kvm_lock);
1269         list_del(&kvm->vm_list);
1270         mutex_unlock(&kvm_lock);
1271         kvm_arch_pre_destroy_vm(kvm);
1272
1273         kvm_free_irq_routing(kvm);
1274         for (i = 0; i < KVM_NR_BUSES; i++) {
1275                 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1276
1277                 if (bus)
1278                         kvm_io_bus_destroy(bus);
1279                 kvm->buses[i] = NULL;
1280         }
1281         kvm_coalesced_mmio_free(kvm);
1282 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
1283         mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1284         /*
1285          * At this point, pending calls to invalidate_range_start()
1286          * have completed but no more MMU notifiers will run, so
1287          * mn_active_invalidate_count may remain unbalanced.
1288          * No threads can be waiting in kvm_swap_active_memslots() as the
1289          * last reference on KVM has been dropped, but freeing
1290          * memslots would deadlock without this manual intervention.
1291          *
1292          * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
1293          * notifier between a start() and end(), then there shouldn't be any
1294          * in-progress invalidations.
1295          */
1296         WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1297         if (kvm->mn_active_invalidate_count)
1298                 kvm->mn_active_invalidate_count = 0;
1299         else
1300                 WARN_ON(kvm->mmu_invalidate_in_progress);
1301 #else
1302         kvm_flush_shadow_all(kvm);
1303 #endif
1304         kvm_arch_destroy_vm(kvm);
1305         kvm_destroy_devices(kvm);
1306         for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
1307                 kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
1308                 kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
1309         }
1310         cleanup_srcu_struct(&kvm->irq_srcu);
1311         cleanup_srcu_struct(&kvm->srcu);
1312 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
1313         xa_destroy(&kvm->mem_attr_array);
1314 #endif
1315         kvm_arch_free_vm(kvm);
1316         preempt_notifier_dec();
1317         kvm_disable_virtualization();
1318         mmdrop(mm);
1319 }
1320
1321 void kvm_get_kvm(struct kvm *kvm)
1322 {
1323         refcount_inc(&kvm->users_count);
1324 }
1325 EXPORT_SYMBOL_GPL(kvm_get_kvm);
1326
1327 /*
1328  * Make sure the vm is not during destruction, which is a safe version of
1329  * kvm_get_kvm().  Return true if kvm referenced successfully, false otherwise.
1330  */
1331 bool kvm_get_kvm_safe(struct kvm *kvm)
1332 {
1333         return refcount_inc_not_zero(&kvm->users_count);
1334 }
1335 EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1336
1337 void kvm_put_kvm(struct kvm *kvm)
1338 {
1339         if (refcount_dec_and_test(&kvm->users_count))
1340                 kvm_destroy_vm(kvm);
1341 }
1342 EXPORT_SYMBOL_GPL(kvm_put_kvm);
1343
1344 /*
1345  * Used to put a reference that was taken on behalf of an object associated
1346  * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1347  * of the new file descriptor fails and the reference cannot be transferred to
1348  * its final owner.  In such cases, the caller is still actively using @kvm and
1349  * will fail miserably if the refcount unexpectedly hits zero.
1350  */
1351 void kvm_put_kvm_no_destroy(struct kvm *kvm)
1352 {
1353         WARN_ON(refcount_dec_and_test(&kvm->users_count));
1354 }
1355 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1356
1357 static int kvm_vm_release(struct inode *inode, struct file *filp)
1358 {
1359         struct kvm *kvm = filp->private_data;
1360
1361         kvm_irqfd_release(kvm);
1362
1363         kvm_put_kvm(kvm);
1364         return 0;
1365 }
1366
1367 /*
1368  * Allocation size is twice as large as the actual dirty bitmap size.
1369  * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1370  */
1371 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1372 {
1373         unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1374
1375         memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1376         if (!memslot->dirty_bitmap)
1377                 return -ENOMEM;
1378
1379         return 0;
1380 }
1381
1382 static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
1383 {
1384         struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1385         int node_idx_inactive = active->node_idx ^ 1;
1386
1387         return &kvm->__memslots[as_id][node_idx_inactive];
1388 }
1389
1390 /*
1391  * Helper to get the address space ID when one of memslot pointers may be NULL.
1392  * This also serves as a sanity that at least one of the pointers is non-NULL,
1393  * and that their address space IDs don't diverge.
1394  */
1395 static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1396                                   struct kvm_memory_slot *b)
1397 {
1398         if (WARN_ON_ONCE(!a && !b))
1399                 return 0;
1400
1401         if (!a)
1402                 return b->as_id;
1403         if (!b)
1404                 return a->as_id;
1405
1406         WARN_ON_ONCE(a->as_id != b->as_id);
1407         return a->as_id;
1408 }
1409
1410 static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1411                                 struct kvm_memory_slot *slot)
1412 {
1413         struct rb_root *gfn_tree = &slots->gfn_tree;
1414         struct rb_node **node, *parent;
1415         int idx = slots->node_idx;
1416
1417         parent = NULL;
1418         for (node = &gfn_tree->rb_node; *node; ) {
1419                 struct kvm_memory_slot *tmp;
1420
1421                 tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
1422                 parent = *node;
1423                 if (slot->base_gfn < tmp->base_gfn)
1424                         node = &(*node)->rb_left;
1425                 else if (slot->base_gfn > tmp->base_gfn)
1426                         node = &(*node)->rb_right;
1427                 else
1428                         BUG();
1429         }
1430
1431         rb_link_node(&slot->gfn_node[idx], parent, node);
1432         rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1433 }
1434
1435 static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1436                                struct kvm_memory_slot *slot)
1437 {
1438         rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1439 }
1440
1441 static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1442                                  struct kvm_memory_slot *old,
1443                                  struct kvm_memory_slot *new)
1444 {
1445         int idx = slots->node_idx;
1446
1447         WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1448
1449         rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
1450                         &slots->gfn_tree);
1451 }
1452
1453 /*
1454  * Replace @old with @new in the inactive memslots.
1455  *
1456  * With NULL @old this simply adds @new.
1457  * With NULL @new this simply removes @old.
1458  *
1459  * If @new is non-NULL its hva_node[slots_idx] range has to be set
1460  * appropriately.
1461  */
1462 static void kvm_replace_memslot(struct kvm *kvm,
1463                                 struct kvm_memory_slot *old,
1464                                 struct kvm_memory_slot *new)
1465 {
1466         int as_id = kvm_memslots_get_as_id(old, new);
1467         struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1468         int idx = slots->node_idx;
1469
1470         if (old) {
1471                 hash_del(&old->id_node[idx]);
1472                 interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
1473
1474                 if ((long)old == atomic_long_read(&slots->last_used_slot))
1475                         atomic_long_set(&slots->last_used_slot, (long)new);
1476
1477                 if (!new) {
1478                         kvm_erase_gfn_node(slots, old);
1479                         return;
1480                 }
1481         }
1482
1483         /*
1484          * Initialize @new's hva range.  Do this even when replacing an @old
1485          * slot, kvm_copy_memslot() deliberately does not touch node data.
1486          */
1487         new->hva_node[idx].start = new->userspace_addr;
1488         new->hva_node[idx].last = new->userspace_addr +
1489                                   (new->npages << PAGE_SHIFT) - 1;
1490
1491         /*
1492          * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
1493          * hva_node needs to be swapped with remove+insert even though hva can't
1494          * change when replacing an existing slot.
1495          */
1496         hash_add(slots->id_hash, &new->id_node[idx], new->id);
1497         interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
1498
1499         /*
1500          * If the memslot gfn is unchanged, rb_replace_node() can be used to
1501          * switch the node in the gfn tree instead of removing the old and
1502          * inserting the new as two separate operations. Replacement is a
1503          * single O(1) operation versus two O(log(n)) operations for
1504          * remove+insert.
1505          */
1506         if (old && old->base_gfn == new->base_gfn) {
1507                 kvm_replace_gfn_node(slots, old, new);
1508         } else {
1509                 if (old)
1510                         kvm_erase_gfn_node(slots, old);
1511                 kvm_insert_gfn_node(slots, new);
1512         }
1513 }
1514
/*
 * The memslot flags that do not access any of the extra space of struct
 * kvm_userspace_memory_region2.  KVM_SET_USER_MEMORY_REGION_V1_FLAGS
 * only allows these.
 */
#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
	(KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES)
1522
1523 static int check_memory_region_flags(struct kvm *kvm,
1524                                      const struct kvm_userspace_memory_region2 *mem)
1525 {
1526         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1527
1528         if (kvm_arch_has_private_mem(kvm))
1529                 valid_flags |= KVM_MEM_GUEST_MEMFD;
1530
1531         /* Dirty logging private memory is not currently supported. */
1532         if (mem->flags & KVM_MEM_GUEST_MEMFD)
1533                 valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
1534
1535         /*
1536          * GUEST_MEMFD is incompatible with read-only memslots, as writes to
1537          * read-only memslots have emulated MMIO, not page fault, semantics,
1538          * and KVM doesn't allow emulated MMIO for private memory.
1539          */
1540         if (kvm_arch_has_readonly_mem(kvm) &&
1541             !(mem->flags & KVM_MEM_GUEST_MEMFD))
1542                 valid_flags |= KVM_MEM_READONLY;
1543
1544         if (mem->flags & ~valid_flags)
1545                 return -EINVAL;
1546
1547         return 0;
1548 }
1549
1550 static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
1551 {
1552         struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1553
1554         /* Grab the generation from the activate memslots. */
1555         u64 gen = __kvm_memslots(kvm, as_id)->generation;
1556
1557         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1558         slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1559
1560         /*
1561          * Do not store the new memslots while there are invalidations in
1562          * progress, otherwise the locking in invalidate_range_start and
1563          * invalidate_range_end will be unbalanced.
1564          */
1565         spin_lock(&kvm->mn_invalidate_lock);
1566         prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1567         while (kvm->mn_active_invalidate_count) {
1568                 set_current_state(TASK_UNINTERRUPTIBLE);
1569                 spin_unlock(&kvm->mn_invalidate_lock);
1570                 schedule();
1571                 spin_lock(&kvm->mn_invalidate_lock);
1572         }
1573         finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1574         rcu_assign_pointer(kvm->memslots[as_id], slots);
1575         spin_unlock(&kvm->mn_invalidate_lock);
1576
1577         /*
1578          * Acquired in kvm_set_memslot. Must be released before synchronize
1579          * SRCU below in order to avoid deadlock with another thread
1580          * acquiring the slots_arch_lock in an srcu critical section.
1581          */
1582         mutex_unlock(&kvm->slots_arch_lock);
1583
1584         synchronize_srcu_expedited(&kvm->srcu);
1585
1586         /*
1587          * Increment the new memslot generation a second time, dropping the
1588          * update in-progress flag and incrementing the generation based on
1589          * the number of address spaces.  This provides a unique and easily
1590          * identifiable generation number while the memslots are in flux.
1591          */
1592         gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1593
1594         /*
1595          * Generations must be unique even across address spaces.  We do not need
1596          * a global counter for that, instead the generation space is evenly split
1597          * across address spaces.  For example, with two address spaces, address
1598          * space 0 will use generations 0, 2, 4, ... while address space 1 will
1599          * use generations 1, 3, 5, ...
1600          */
1601         gen += kvm_arch_nr_memslot_as_ids(kvm);
1602
1603         kvm_arch_memslots_updated(kvm, gen);
1604
1605         slots->generation = gen;
1606 }
1607
1608 static int kvm_prepare_memory_region(struct kvm *kvm,
1609                                      const struct kvm_memory_slot *old,
1610                                      struct kvm_memory_slot *new,
1611                                      enum kvm_mr_change change)
1612 {
1613         int r;
1614
1615         /*
1616          * If dirty logging is disabled, nullify the bitmap; the old bitmap
1617          * will be freed on "commit".  If logging is enabled in both old and
1618          * new, reuse the existing bitmap.  If logging is enabled only in the
1619          * new and KVM isn't using a ring buffer, allocate and initialize a
1620          * new bitmap.
1621          */
1622         if (change != KVM_MR_DELETE) {
1623                 if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1624                         new->dirty_bitmap = NULL;
1625                 else if (old && old->dirty_bitmap)
1626                         new->dirty_bitmap = old->dirty_bitmap;
1627                 else if (kvm_use_dirty_bitmap(kvm)) {
1628                         r = kvm_alloc_dirty_bitmap(new);
1629                         if (r)
1630                                 return r;
1631
1632                         if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1633                                 bitmap_set(new->dirty_bitmap, 0, new->npages);
1634                 }
1635         }
1636
1637         r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1638
1639         /* Free the bitmap on failure if it was allocated above. */
1640         if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
1641                 kvm_destroy_dirty_bitmap(new);
1642
1643         return r;
1644 }
1645
1646 static void kvm_commit_memory_region(struct kvm *kvm,
1647                                      struct kvm_memory_slot *old,
1648                                      const struct kvm_memory_slot *new,
1649                                      enum kvm_mr_change change)
1650 {
1651         int old_flags = old ? old->flags : 0;
1652         int new_flags = new ? new->flags : 0;
1653         /*
1654          * Update the total number of memslot pages before calling the arch
1655          * hook so that architectures can consume the result directly.
1656          */
1657         if (change == KVM_MR_DELETE)
1658                 kvm->nr_memslot_pages -= old->npages;
1659         else if (change == KVM_MR_CREATE)
1660                 kvm->nr_memslot_pages += new->npages;
1661
1662         if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1663                 int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
1664                 atomic_set(&kvm->nr_memslots_dirty_logging,
1665                            atomic_read(&kvm->nr_memslots_dirty_logging) + change);
1666         }
1667
1668         kvm_arch_commit_memory_region(kvm, old, new, change);
1669
1670         switch (change) {
1671         case KVM_MR_CREATE:
1672                 /* Nothing more to do. */
1673                 break;
1674         case KVM_MR_DELETE:
1675                 /* Free the old memslot and all its metadata. */
1676                 kvm_free_memslot(kvm, old);
1677                 break;
1678         case KVM_MR_MOVE:
1679         case KVM_MR_FLAGS_ONLY:
1680                 /*
1681                  * Free the dirty bitmap as needed; the below check encompasses
1682                  * both the flags and whether a ring buffer is being used)
1683                  */
1684                 if (old->dirty_bitmap && !new->dirty_bitmap)
1685                         kvm_destroy_dirty_bitmap(old);
1686
1687                 /*
1688                  * The final quirk.  Free the detached, old slot, but only its
1689                  * memory, not any metadata.  Metadata, including arch specific
1690                  * data, may be reused by @new.
1691                  */
1692                 kfree(old);
1693                 break;
1694         default:
1695                 BUG();
1696         }
1697 }
1698
/*
 * Make @new live.  The caller must already have installed @new in the
 * inactive memslot set; this swaps the active and inactive sets and then
 * repeats the @old => @new replacement in the set that just became inactive,
 * i.e. once @old is unreachable and can be safely modified.
 *
 * A NULL @old means @new is simply added, a NULL @new means @old is simply
 * removed; the sets are swapped in both cases.
 */
static void kvm_activate_memslot(struct kvm *kvm,
                                 struct kvm_memory_slot *old,
                                 struct kvm_memory_slot *new)
{
        /* Either slot may be NULL, let the helper pick the address space. */
        kvm_swap_active_memslots(kvm, kvm_memslots_get_as_id(old, new));

        /* Bring the now inactive set in line with the active one. */
        kvm_replace_memslot(kvm, old, new);
}
1719
1720 static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1721                              const struct kvm_memory_slot *src)
1722 {
1723         dest->base_gfn = src->base_gfn;
1724         dest->npages = src->npages;
1725         dest->dirty_bitmap = src->dirty_bitmap;
1726         dest->arch = src->arch;
1727         dest->userspace_addr = src->userspace_addr;
1728         dest->flags = src->flags;
1729         dest->id = src->id;
1730         dest->as_id = src->as_id;
1731 }
1732
1733 static void kvm_invalidate_memslot(struct kvm *kvm,
1734                                    struct kvm_memory_slot *old,
1735                                    struct kvm_memory_slot *invalid_slot)
1736 {
1737         /*
1738          * Mark the current slot INVALID.  As with all memslot modifications,
1739          * this must be done on an unreachable slot to avoid modifying the
1740          * current slot in the active tree.
1741          */
1742         kvm_copy_memslot(invalid_slot, old);
1743         invalid_slot->flags |= KVM_MEMSLOT_INVALID;
1744         kvm_replace_memslot(kvm, old, invalid_slot);
1745
1746         /*
1747          * Activate the slot that is now marked INVALID, but don't propagate
1748          * the slot to the now inactive slots. The slot is either going to be
1749          * deleted or recreated as a new slot.
1750          */
1751         kvm_swap_active_memslots(kvm, old->as_id);
1752
1753         /*
1754          * From this point no new shadow pages pointing to a deleted, or moved,
1755          * memslot will be created.  Validation of sp->gfn happens in:
1756          *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1757          *      - kvm_is_visible_gfn (mmu_check_root)
1758          */
1759         kvm_arch_flush_shadow_memslot(kvm, old);
1760         kvm_arch_guest_memory_reclaimed(kvm);
1761
1762         /* Was released by kvm_swap_active_memslots(), reacquire. */
1763         mutex_lock(&kvm->slots_arch_lock);
1764
1765         /*
1766          * Copy the arch-specific field of the newly-installed slot back to the
1767          * old slot as the arch data could have changed between releasing
1768          * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1769          * above.  Writers are required to retrieve memslots *after* acquiring
1770          * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1771          */
1772         old->arch = invalid_slot->arch;
1773 }
1774
1775 static void kvm_create_memslot(struct kvm *kvm,
1776                                struct kvm_memory_slot *new)
1777 {
1778         /* Add the new memslot to the inactive set and activate. */
1779         kvm_replace_memslot(kvm, NULL, new);
1780         kvm_activate_memslot(kvm, NULL, new);
1781 }
1782
1783 static void kvm_delete_memslot(struct kvm *kvm,
1784                                struct kvm_memory_slot *old,
1785                                struct kvm_memory_slot *invalid_slot)
1786 {
1787         /*
1788          * Remove the old memslot (in the inactive memslots) by passing NULL as
1789          * the "new" slot, and for the invalid version in the active slots.
1790          */
1791         kvm_replace_memslot(kvm, old, NULL);
1792         kvm_activate_memslot(kvm, invalid_slot, NULL);
1793 }
1794
/*
 * MOVE: install @new in both memslot sets, taking the place of @old in the
 * inactive set and of the INVALID placeholder in the currently active one.
 */
static void kvm_move_memslot(struct kvm *kvm,
                             struct kvm_memory_slot *old,
                             struct kvm_memory_slot *new,
                             struct kvm_memory_slot *invalid_slot)
{
        /* Inactive set: @old => @new. */
        kvm_replace_memslot(kvm, old, new);

        /* Swap the sets, then @invalid_slot => @new on the other side. */
        kvm_activate_memslot(kvm, invalid_slot, new);
}
1807
/*
 * FLAGS_ONLY: comparable to MOVE, except that no intermediate INVALID slot
 * is needed because the slot doesn't have to be zapped.  The old memslot is
 * simply exchanged for the updated copy in both memslot sets.
 */
static void kvm_update_flags_memslot(struct kvm *kvm,
                                     struct kvm_memory_slot *old,
                                     struct kvm_memory_slot *new)
{
        /* Inactive set first ... */
        kvm_replace_memslot(kvm, old, new);

        /* ... then swap and do the same exchange on the other set. */
        kvm_activate_memslot(kvm, old, new);
}
1820
1821 static int kvm_set_memslot(struct kvm *kvm,
1822                            struct kvm_memory_slot *old,
1823                            struct kvm_memory_slot *new,
1824                            enum kvm_mr_change change)
1825 {
1826         struct kvm_memory_slot *invalid_slot;
1827         int r;
1828
1829         /*
1830          * Released in kvm_swap_active_memslots().
1831          *
1832          * Must be held from before the current memslots are copied until after
1833          * the new memslots are installed with rcu_assign_pointer, then
1834          * released before the synchronize srcu in kvm_swap_active_memslots().
1835          *
1836          * When modifying memslots outside of the slots_lock, must be held
1837          * before reading the pointer to the current memslots until after all
1838          * changes to those memslots are complete.
1839          *
1840          * These rules ensure that installing new memslots does not lose
1841          * changes made to the previous memslots.
1842          */
1843         mutex_lock(&kvm->slots_arch_lock);
1844
1845         /*
1846          * Invalidate the old slot if it's being deleted or moved.  This is
1847          * done prior to actually deleting/moving the memslot to allow vCPUs to
1848          * continue running by ensuring there are no mappings or shadow pages
1849          * for the memslot when it is deleted/moved.  Without pre-invalidation
1850          * (and without a lock), a window would exist between effecting the
1851          * delete/move and committing the changes in arch code where KVM or a
1852          * guest could access a non-existent memslot.
1853          *
1854          * Modifications are done on a temporary, unreachable slot.  The old
1855          * slot needs to be preserved in case a later step fails and the
1856          * invalidation needs to be reverted.
1857          */
1858         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1859                 invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1860                 if (!invalid_slot) {
1861                         mutex_unlock(&kvm->slots_arch_lock);
1862                         return -ENOMEM;
1863                 }
1864                 kvm_invalidate_memslot(kvm, old, invalid_slot);
1865         }
1866
1867         r = kvm_prepare_memory_region(kvm, old, new, change);
1868         if (r) {
1869                 /*
1870                  * For DELETE/MOVE, revert the above INVALID change.  No
1871                  * modifications required since the original slot was preserved
1872                  * in the inactive slots.  Changing the active memslots also
1873                  * release slots_arch_lock.
1874                  */
1875                 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1876                         kvm_activate_memslot(kvm, invalid_slot, old);
1877                         kfree(invalid_slot);
1878                 } else {
1879                         mutex_unlock(&kvm->slots_arch_lock);
1880                 }
1881                 return r;
1882         }
1883
1884         /*
1885          * For DELETE and MOVE, the working slot is now active as the INVALID
1886          * version of the old slot.  MOVE is particularly special as it reuses
1887          * the old slot and returns a copy of the old slot (in working_slot).
1888          * For CREATE, there is no old slot.  For DELETE and FLAGS_ONLY, the
1889          * old slot is detached but otherwise preserved.
1890          */
1891         if (change == KVM_MR_CREATE)
1892                 kvm_create_memslot(kvm, new);
1893         else if (change == KVM_MR_DELETE)
1894                 kvm_delete_memslot(kvm, old, invalid_slot);
1895         else if (change == KVM_MR_MOVE)
1896                 kvm_move_memslot(kvm, old, new, invalid_slot);
1897         else if (change == KVM_MR_FLAGS_ONLY)
1898                 kvm_update_flags_memslot(kvm, old, new);
1899         else
1900                 BUG();
1901
1902         /* Free the temporary INVALID slot used for DELETE and MOVE. */
1903         if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1904                 kfree(invalid_slot);
1905
1906         /*
1907          * No need to refresh new->arch, changes after dropping slots_arch_lock
1908          * will directly hit the final, active memslot.  Architectures are
1909          * responsible for knowing that new->arch may be stale.
1910          */
1911         kvm_commit_memory_region(kvm, old, new, change);
1912
1913         return 0;
1914 }
1915
1916 static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
1917                                       gfn_t start, gfn_t end)
1918 {
1919         struct kvm_memslot_iter iter;
1920
1921         kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1922                 if (iter.slot->id != id)
1923                         return true;
1924         }
1925
1926         return false;
1927 }
1928
1929 /*
1930  * Allocate some memory and give it an address in the guest physical address
1931  * space.
1932  *
1933  * Discontiguous memory is allowed, mostly for framebuffers.
1934  *
1935  * Must be called holding kvm->slots_lock for write.
1936  */
1937 int __kvm_set_memory_region(struct kvm *kvm,
1938                             const struct kvm_userspace_memory_region2 *mem)
1939 {
1940         struct kvm_memory_slot *old, *new;
1941         struct kvm_memslots *slots;
1942         enum kvm_mr_change change;
1943         unsigned long npages;
1944         gfn_t base_gfn;
1945         int as_id, id;
1946         int r;
1947
1948         r = check_memory_region_flags(kvm, mem);
1949         if (r)
1950                 return r;
1951
1952         as_id = mem->slot >> 16;
1953         id = (u16)mem->slot;
1954
1955         /* General sanity checks */
1956         if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1957             (mem->memory_size != (unsigned long)mem->memory_size))
1958                 return -EINVAL;
1959         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1960                 return -EINVAL;
1961         /* We can read the guest memory with __xxx_user() later on. */
1962         if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1963             (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1964              !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1965                         mem->memory_size))
1966                 return -EINVAL;
1967         if (mem->flags & KVM_MEM_GUEST_MEMFD &&
1968             (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
1969              mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
1970                 return -EINVAL;
1971         if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
1972                 return -EINVAL;
1973         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1974                 return -EINVAL;
1975         if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
1976                 return -EINVAL;
1977
1978         slots = __kvm_memslots(kvm, as_id);
1979
1980         /*
1981          * Note, the old memslot (and the pointer itself!) may be invalidated
1982          * and/or destroyed by kvm_set_memslot().
1983          */
1984         old = id_to_memslot(slots, id);
1985
1986         if (!mem->memory_size) {
1987                 if (!old || !old->npages)
1988                         return -EINVAL;
1989
1990                 if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
1991                         return -EIO;
1992
1993                 return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
1994         }
1995
1996         base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
1997         npages = (mem->memory_size >> PAGE_SHIFT);
1998
1999         if (!old || !old->npages) {
2000                 change = KVM_MR_CREATE;
2001
2002                 /*
2003                  * To simplify KVM internals, the total number of pages across
2004                  * all memslots must fit in an unsigned long.
2005                  */
2006                 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2007                         return -EINVAL;
2008         } else { /* Modify an existing slot. */
2009                 /* Private memslots are immutable, they can only be deleted. */
2010                 if (mem->flags & KVM_MEM_GUEST_MEMFD)
2011                         return -EINVAL;
2012                 if ((mem->userspace_addr != old->userspace_addr) ||
2013                     (npages != old->npages) ||
2014                     ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
2015                         return -EINVAL;
2016
2017                 if (base_gfn != old->base_gfn)
2018                         change = KVM_MR_MOVE;
2019                 else if (mem->flags != old->flags)
2020                         change = KVM_MR_FLAGS_ONLY;
2021                 else /* Nothing to change. */
2022                         return 0;
2023         }
2024
2025         if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
2026             kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
2027                 return -EEXIST;
2028
2029         /* Allocate a slot that will persist in the memslot. */
2030         new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
2031         if (!new)
2032                 return -ENOMEM;
2033
2034         new->as_id = as_id;
2035         new->id = id;
2036         new->base_gfn = base_gfn;
2037         new->npages = npages;
2038         new->flags = mem->flags;
2039         new->userspace_addr = mem->userspace_addr;
2040         if (mem->flags & KVM_MEM_GUEST_MEMFD) {
2041                 r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
2042                 if (r)
2043                         goto out;
2044         }
2045
2046         r = kvm_set_memslot(kvm, old, new, change);
2047         if (r)
2048                 goto out_unbind;
2049
2050         return 0;
2051
2052 out_unbind:
2053         if (mem->flags & KVM_MEM_GUEST_MEMFD)
2054                 kvm_gmem_unbind(new);
2055 out:
2056         kfree(new);
2057         return r;
2058 }
2059 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2060
2061 int kvm_set_memory_region(struct kvm *kvm,
2062                           const struct kvm_userspace_memory_region2 *mem)
2063 {
2064         int r;
2065
2066         mutex_lock(&kvm->slots_lock);
2067         r = __kvm_set_memory_region(kvm, mem);
2068         mutex_unlock(&kvm->slots_lock);
2069         return r;
2070 }
2071 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
2072
2073 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2074                                           struct kvm_userspace_memory_region2 *mem)
2075 {
2076         if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
2077                 return -EINVAL;
2078
2079         return kvm_set_memory_region(kvm, mem);
2080 }
2081
2082 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2083 /**
2084  * kvm_get_dirty_log - get a snapshot of dirty pages
2085  * @kvm:        pointer to kvm instance
2086  * @log:        slot id and address to which we copy the log
2087  * @is_dirty:   set to '1' if any dirty pages were found
2088  * @memslot:    set to the associated memslot, always valid on success
2089  */
2090 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
2091                       int *is_dirty, struct kvm_memory_slot **memslot)
2092 {
2093         struct kvm_memslots *slots;
2094         int i, as_id, id;
2095         unsigned long n;
2096         unsigned long any = 0;
2097
2098         /* Dirty ring tracking may be exclusive to dirty log tracking */
2099         if (!kvm_use_dirty_bitmap(kvm))
2100                 return -ENXIO;
2101
2102         *memslot = NULL;
2103         *is_dirty = 0;
2104
2105         as_id = log->slot >> 16;
2106         id = (u16)log->slot;
2107         if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2108                 return -EINVAL;
2109
2110         slots = __kvm_memslots(kvm, as_id);
2111         *memslot = id_to_memslot(slots, id);
2112         if (!(*memslot) || !(*memslot)->dirty_bitmap)
2113                 return -ENOENT;
2114
2115         kvm_arch_sync_dirty_log(kvm, *memslot);
2116
2117         n = kvm_dirty_bitmap_bytes(*memslot);
2118
2119         for (i = 0; !any && i < n/sizeof(long); ++i)
2120                 any = (*memslot)->dirty_bitmap[i];
2121
2122         if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2123                 return -EFAULT;
2124
2125         if (any)
2126                 *is_dirty = 1;
2127         return 0;
2128 }
2129 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
2130
2131 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2132 /**
2133  * kvm_get_dirty_log_protect - get a snapshot of dirty pages
2134  *      and reenable dirty page tracking for the corresponding pages.
2135  * @kvm:        pointer to kvm instance
2136  * @log:        slot id and address to which we copy the log
2137  *
2138  * We need to keep it in mind that VCPU threads can write to the bitmap
2139  * concurrently. So, to avoid losing track of dirty pages we keep the
2140  * following order:
2141  *
2142  *    1. Take a snapshot of the bit and clear it if needed.
2143  *    2. Write protect the corresponding page.
2144  *    3. Copy the snapshot to the userspace.
2145  *    4. Upon return caller flushes TLB's if needed.
2146  *
2147  * Between 2 and 4, the guest may write to the page using the remaining TLB
2148  * entry.  This is not a problem because the page is reported dirty using
2149  * the snapshot taken before and step 4 ensures that writes done after
2150  * exiting to userspace will be logged for the next call.
2151  *
2152  */
2153 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
2154 {
2155         struct kvm_memslots *slots;
2156         struct kvm_memory_slot *memslot;
2157         int i, as_id, id;
2158         unsigned long n;
2159         unsigned long *dirty_bitmap;
2160         unsigned long *dirty_bitmap_buffer;
2161         bool flush;
2162
2163         /* Dirty ring tracking may be exclusive to dirty log tracking */
2164         if (!kvm_use_dirty_bitmap(kvm))
2165                 return -ENXIO;
2166
2167         as_id = log->slot >> 16;
2168         id = (u16)log->slot;
2169         if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2170                 return -EINVAL;
2171
2172         slots = __kvm_memslots(kvm, as_id);
2173         memslot = id_to_memslot(slots, id);
2174         if (!memslot || !memslot->dirty_bitmap)
2175                 return -ENOENT;
2176
2177         dirty_bitmap = memslot->dirty_bitmap;
2178
2179         kvm_arch_sync_dirty_log(kvm, memslot);
2180
2181         n = kvm_dirty_bitmap_bytes(memslot);
2182         flush = false;
2183         if (kvm->manual_dirty_log_protect) {
2184                 /*
2185                  * Unlike kvm_get_dirty_log, we always return false in *flush,
2186                  * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
2187                  * is some code duplication between this function and
2188                  * kvm_get_dirty_log, but hopefully all architecture
2189                  * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
2190                  * can be eliminated.
2191                  */
2192                 dirty_bitmap_buffer = dirty_bitmap;
2193         } else {
2194                 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2195                 memset(dirty_bitmap_buffer, 0, n);
2196
2197                 KVM_MMU_LOCK(kvm);
2198                 for (i = 0; i < n / sizeof(long); i++) {
2199                         unsigned long mask;
2200                         gfn_t offset;
2201
2202                         if (!dirty_bitmap[i])
2203                                 continue;
2204
2205                         flush = true;
2206                         mask = xchg(&dirty_bitmap[i], 0);
2207                         dirty_bitmap_buffer[i] = mask;
2208
2209                         offset = i * BITS_PER_LONG;
2210                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2211                                                                 offset, mask);
2212                 }
2213                 KVM_MMU_UNLOCK(kvm);
2214         }
2215
2216         if (flush)
2217                 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2218
2219         if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
2220                 return -EFAULT;
2221         return 0;
2222 }
2223
2224
2225 /**
2226  * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2227  * @kvm: kvm instance
2228  * @log: slot id and address to which we copy the log
2229  *
2230  * Steps 1-4 below provide general overview of dirty page logging. See
2231  * kvm_get_dirty_log_protect() function description for additional details.
2232  *
2233  * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
2234  * always flush the TLB (step 4) even if previous step failed  and the dirty
2235  * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
2236  * does not preclude user space subsequent dirty log read. Flushing TLB ensures
2237  * writes will be marked dirty for next log read.
2238  *
2239  *   1. Take a snapshot of the bit and clear it if needed.
2240  *   2. Write protect the corresponding page.
2241  *   3. Copy the snapshot to the userspace.
2242  *   4. Flush TLB's if needed.
2243  */
2244 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2245                                       struct kvm_dirty_log *log)
2246 {
2247         int r;
2248
2249         mutex_lock(&kvm->slots_lock);
2250
2251         r = kvm_get_dirty_log_protect(kvm, log);
2252
2253         mutex_unlock(&kvm->slots_lock);
2254         return r;
2255 }
2256
2257 /**
2258  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2259  *      and reenable dirty page tracking for the corresponding pages.
2260  * @kvm:        pointer to kvm instance
2261  * @log:        slot id and address from which to fetch the bitmap of dirty pages
2262  */
2263 static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2264                                        struct kvm_clear_dirty_log *log)
2265 {
2266         struct kvm_memslots *slots;
2267         struct kvm_memory_slot *memslot;
2268         int as_id, id;
2269         gfn_t offset;
2270         unsigned long i, n;
2271         unsigned long *dirty_bitmap;
2272         unsigned long *dirty_bitmap_buffer;
2273         bool flush;
2274
2275         /* Dirty ring tracking may be exclusive to dirty log tracking */
2276         if (!kvm_use_dirty_bitmap(kvm))
2277                 return -ENXIO;
2278
2279         as_id = log->slot >> 16;
2280         id = (u16)log->slot;
2281         if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
2282                 return -EINVAL;
2283
2284         if (log->first_page & 63)
2285                 return -EINVAL;
2286
2287         slots = __kvm_memslots(kvm, as_id);
2288         memslot = id_to_memslot(slots, id);
2289         if (!memslot || !memslot->dirty_bitmap)
2290                 return -ENOENT;
2291
2292         dirty_bitmap = memslot->dirty_bitmap;
2293
2294         n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2295
2296         if (log->first_page > memslot->npages ||
2297             log->num_pages > memslot->npages - log->first_page ||
2298             (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2299             return -EINVAL;
2300
2301         kvm_arch_sync_dirty_log(kvm, memslot);
2302
2303         flush = false;
2304         dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2305         if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2306                 return -EFAULT;
2307
2308         KVM_MMU_LOCK(kvm);
2309         for (offset = log->first_page, i = offset / BITS_PER_LONG,
2310                  n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2311              i++, offset += BITS_PER_LONG) {
2312                 unsigned long mask = *dirty_bitmap_buffer++;
2313                 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2314                 if (!mask)
2315                         continue;
2316
2317                 mask &= atomic_long_fetch_andnot(mask, p);
2318
2319                 /*
2320                  * mask contains the bits that really have been cleared.  This
2321                  * never includes any bits beyond the length of the memslot (if
2322                  * the length is not aligned to 64 pages), therefore it is not
2323                  * a problem if userspace sets them in log->dirty_bitmap.
2324                 */
2325                 if (mask) {
2326                         flush = true;
2327                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2328                                                                 offset, mask);
2329                 }
2330         }
2331         KVM_MMU_UNLOCK(kvm);
2332
2333         if (flush)
2334                 kvm_flush_remote_tlbs_memslot(kvm, memslot);
2335
2336         return 0;
2337 }
2338
2339 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2340                                         struct kvm_clear_dirty_log *log)
2341 {
2342         int r;
2343
2344         mutex_lock(&kvm->slots_lock);
2345
2346         r = kvm_clear_dirty_log_protect(kvm, log);
2347
2348         mutex_unlock(&kvm->slots_lock);
2349         return r;
2350 }
2351 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2352
2353 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
2354 static u64 kvm_supported_mem_attributes(struct kvm *kvm)
2355 {
2356         if (!kvm || kvm_arch_has_private_mem(kvm))
2357                 return KVM_MEMORY_ATTRIBUTE_PRIVATE;
2358
2359         return 0;
2360 }
2361
2362 /*
2363  * Returns true if _all_ gfns in the range [@start, @end) have attributes
2364  * such that the bits in @mask match @attrs.
2365  */
2366 bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2367                                      unsigned long mask, unsigned long attrs)
2368 {
2369         XA_STATE(xas, &kvm->mem_attr_array, start);
2370         unsigned long index;
2371         void *entry;
2372
2373         mask &= kvm_supported_mem_attributes(kvm);
2374         if (attrs & ~mask)
2375                 return false;
2376
2377         if (end == start + 1)
2378                 return (kvm_get_memory_attributes(kvm, start) & mask) == attrs;
2379
2380         guard(rcu)();
2381         if (!attrs)
2382                 return !xas_find(&xas, end - 1);
2383
2384         for (index = start; index < end; index++) {
2385                 do {
2386                         entry = xas_next(&xas);
2387                 } while (xas_retry(&xas, entry));
2388
2389                 if (xas.xa_index != index ||
2390                     (xa_to_value(entry) & mask) != attrs)
2391                         return false;
2392         }
2393
2394         return true;
2395 }
2396
/*
 * Invoke range->handler on the portion of [range->start, range->end) that is
 * covered by each memslot, for every memslot address space of @kvm.
 *
 * The MMU lock is taken lazily, i.e. only once the first overlapping memslot
 * is found, and range->on_lock (if provided) is called right after the lock
 * is acquired.  If no memslot overlaps the range, neither the lock nor the
 * callbacks are touched.
 */
static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
                                                 struct kvm_mmu_notifier_range *range)
{
        struct kvm_gfn_range gfn_range;
        struct kvm_memory_slot *slot;
        struct kvm_memslots *slots;
        struct kvm_memslot_iter iter;
        bool found_memslot = false;
        bool ret = false;
        int i;

        /* Fields that are identical for every memslot visited below. */
        gfn_range.arg = range->arg;
        gfn_range.may_block = range->may_block;

        for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                slots = __kvm_memslots(kvm, i);

                kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
                        slot = iter.slot;
                        gfn_range.slot = slot;

                        /* Clamp the requested range to this memslot. */
                        gfn_range.start = max(range->start, slot->base_gfn);
                        gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
                        if (gfn_range.start >= gfn_range.end)
                                continue;

                        /* First hit: take the MMU lock and run on_lock once. */
                        if (!found_memslot) {
                                found_memslot = true;
                                KVM_MMU_LOCK(kvm);
                                if (!IS_KVM_NULL_FN(range->on_lock))
                                        range->on_lock(kvm);
                        }

                        /* Accumulate: any handler returning true sets ret. */
                        ret |= range->handler(kvm, &gfn_range);
                }
        }

        /* ret can only be true if the lock was taken, so this runs locked. */
        if (range->flush_on_ret && ret)
                kvm_flush_remote_tlbs(kvm);

        if (found_memslot)
                KVM_MMU_UNLOCK(kvm);
}
2440
2441 static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
2442                                           struct kvm_gfn_range *range)
2443 {
2444         /*
2445          * Unconditionally add the range to the invalidation set, regardless of
2446          * whether or not the arch callback actually needs to zap SPTEs.  E.g.
2447          * if KVM supports RWX attributes in the future and the attributes are
2448          * going from R=>RW, zapping isn't strictly necessary.  Unconditionally
2449          * adding the range allows KVM to require that MMU invalidations add at
2450          * least one range between begin() and end(), e.g. allows KVM to detect
2451          * bugs where the add() is missed.  Relaxing the rule *might* be safe,
2452          * but it's not obvious that allowing new mappings while the attributes
2453          * are in flux is desirable or worth the complexity.
2454          */
2455         kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
2456
2457         return kvm_arch_pre_set_memory_attributes(kvm, range);
2458 }
2459
2460 /* Set @attributes for the gfn range [@start, @end). */
2461 static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
2462                                      unsigned long attributes)
2463 {
2464         struct kvm_mmu_notifier_range pre_set_range = {
2465                 .start = start,
2466                 .end = end,
2467                 .handler = kvm_pre_set_memory_attributes,
2468                 .on_lock = kvm_mmu_invalidate_begin,
2469                 .flush_on_ret = true,
2470                 .may_block = true,
2471         };
2472         struct kvm_mmu_notifier_range post_set_range = {
2473                 .start = start,
2474                 .end = end,
2475                 .arg.attributes = attributes,
2476                 .handler = kvm_arch_post_set_memory_attributes,
2477                 .on_lock = kvm_mmu_invalidate_end,
2478                 .may_block = true,
2479         };
2480         unsigned long i;
2481         void *entry;
2482         int r = 0;
2483
2484         entry = attributes ? xa_mk_value(attributes) : NULL;
2485
2486         mutex_lock(&kvm->slots_lock);
2487
2488         /* Nothing to do if the entire range as the desired attributes. */
2489         if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
2490                 goto out_unlock;
2491
2492         /*
2493          * Reserve memory ahead of time to avoid having to deal with failures
2494          * partway through setting the new attributes.
2495          */
2496         for (i = start; i < end; i++) {
2497                 r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
2498                 if (r)
2499                         goto out_unlock;
2500         }
2501
2502         kvm_handle_gfn_range(kvm, &pre_set_range);
2503
2504         for (i = start; i < end; i++) {
2505                 r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
2506                                     GFP_KERNEL_ACCOUNT));
2507                 KVM_BUG_ON(r, kvm);
2508         }
2509
2510         kvm_handle_gfn_range(kvm, &post_set_range);
2511
2512 out_unlock:
2513         mutex_unlock(&kvm->slots_lock);
2514
2515         return r;
2516 }
2517 static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
2518                                            struct kvm_memory_attributes *attrs)
2519 {
2520         gfn_t start, end;
2521
2522         /* flags is currently not used. */
2523         if (attrs->flags)
2524                 return -EINVAL;
2525         if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
2526                 return -EINVAL;
2527         if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
2528                 return -EINVAL;
2529         if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
2530                 return -EINVAL;
2531
2532         start = attrs->address >> PAGE_SHIFT;
2533         end = (attrs->address + attrs->size) >> PAGE_SHIFT;
2534
2535         /*
2536          * xarray tracks data using "unsigned long", and as a result so does
2537          * KVM.  For simplicity, supports generic attributes only on 64-bit
2538          * architectures.
2539          */
2540         BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
2541
2542         return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
2543 }
2544 #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
2545
2546 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2547 {
2548         return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2549 }
2550 EXPORT_SYMBOL_GPL(gfn_to_memslot);
2551
2552 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2553 {
2554         struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2555         u64 gen = slots->generation;
2556         struct kvm_memory_slot *slot;
2557
2558         /*
2559          * This also protects against using a memslot from a different address space,
2560          * since different address spaces have different generation numbers.
2561          */
2562         if (unlikely(gen != vcpu->last_used_slot_gen)) {
2563                 vcpu->last_used_slot = NULL;
2564                 vcpu->last_used_slot_gen = gen;
2565         }
2566
2567         slot = try_get_memslot(vcpu->last_used_slot, gfn);
2568         if (slot)
2569                 return slot;
2570
2571         /*
2572          * Fall back to searching all memslots. We purposely use
2573          * search_memslots() instead of __gfn_to_memslot() to avoid
2574          * thrashing the VM-wide last_used_slot in kvm_memslots.
2575          */
2576         slot = search_memslots(slots, gfn, false);
2577         if (slot) {
2578                 vcpu->last_used_slot = slot;
2579                 return slot;
2580         }
2581
2582         return NULL;
2583 }
2584
2585 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2586 {
2587         struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2588
2589         return kvm_is_visible_memslot(memslot);
2590 }
2591 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2592
2593 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2594 {
2595         struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2596
2597         return kvm_is_visible_memslot(memslot);
2598 }
2599 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2600
2601 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2602 {
2603         struct vm_area_struct *vma;
2604         unsigned long addr, size;
2605
2606         size = PAGE_SIZE;
2607
2608         addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2609         if (kvm_is_error_hva(addr))
2610                 return PAGE_SIZE;
2611
2612         mmap_read_lock(current->mm);
2613         vma = find_vma(current->mm, addr);
2614         if (!vma)
2615                 goto out;
2616
2617         size = vma_kernel_pagesize(vma);
2618
2619 out:
2620         mmap_read_unlock(current->mm);
2621
2622         return size;
2623 }
2624
2625 static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2626 {
2627         return slot->flags & KVM_MEM_READONLY;
2628 }
2629
2630 static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2631                                        gfn_t *nr_pages, bool write)
2632 {
2633         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2634                 return KVM_HVA_ERR_BAD;
2635
2636         if (memslot_is_readonly(slot) && write)
2637                 return KVM_HVA_ERR_RO_BAD;
2638
2639         if (nr_pages)
2640                 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2641
2642         return __gfn_to_hva_memslot(slot, gfn);
2643 }
2644
2645 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2646                                      gfn_t *nr_pages)
2647 {
2648         return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2649 }
2650
2651 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2652                                         gfn_t gfn)
2653 {
2654         return gfn_to_hva_many(slot, gfn, NULL);
2655 }
2656 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2657
2658 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2659 {
2660         return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2661 }
2662 EXPORT_SYMBOL_GPL(gfn_to_hva);
2663
2664 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2665 {
2666         return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2667 }
2668 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2669
2670 /*
2671  * Return the hva of a @gfn and the R/W attribute if possible.
2672  *
2673  * @slot: the kvm_memory_slot which contains @gfn
2674  * @gfn: the gfn to be translated
2675  * @writable: used to return the read/write attribute of the @slot if the hva
2676  * is valid and @writable is not NULL
2677  */
2678 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2679                                       gfn_t gfn, bool *writable)
2680 {
2681         unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2682
2683         if (!kvm_is_error_hva(hva) && writable)
2684                 *writable = !memslot_is_readonly(slot);
2685
2686         return hva;
2687 }
2688
2689 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2690 {
2691         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2692
2693         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2694 }
2695
2696 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2697 {
2698         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2699
2700         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2701 }
2702
2703 static bool kvm_is_ad_tracked_page(struct page *page)
2704 {
2705         /*
2706          * Per page-flags.h, pages tagged PG_reserved "should in general not be
2707          * touched (e.g. set dirty) except by its owner".
2708          */
2709         return !PageReserved(page);
2710 }
2711
/* Mark @page dirty, unless KVM must not touch its accessed/dirty state. */
static void kvm_set_page_dirty(struct page *page)
{
        if (!kvm_is_ad_tracked_page(page))
                return;

        SetPageDirty(page);
}
2717
/* Mark @page accessed, unless KVM must not touch its accessed/dirty state. */
static void kvm_set_page_accessed(struct page *page)
{
        if (!kvm_is_ad_tracked_page(page))
                return;

        mark_page_accessed(page);
}
2723
/* Mark @page accessed and drop a reference to it; NULL is a no-op. */
void kvm_release_page_clean(struct page *page)
{
        if (page) {
                kvm_set_page_accessed(page);
                put_page(page);
        }
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2733
/* Mark @page dirty, then release it like a clean page; NULL is a no-op. */
void kvm_release_page_dirty(struct page *page)
{
        if (page) {
                kvm_set_page_dirty(page);
                kvm_release_page_clean(page);
        }
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2743
2744 static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
2745                                  struct follow_pfnmap_args *map, bool writable)
2746 {
2747         kvm_pfn_t pfn;
2748
2749         WARN_ON_ONCE(!!page == !!map);
2750
2751         if (kfp->map_writable)
2752                 *kfp->map_writable = writable;
2753
2754         if (map)
2755                 pfn = map->pfn;
2756         else
2757                 pfn = page_to_pfn(page);
2758
2759         *kfp->refcounted_page = page;
2760
2761         return pfn;
2762 }
2763
2764 /*
2765  * The fast path to get the writable pfn which will be stored in @pfn,
2766  * true indicates success, otherwise false is returned.
2767  */
2768 static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
2769 {
2770         struct page *page;
2771         bool r;
2772
2773         /*
2774          * Try the fast-only path when the caller wants to pin/get the page for
2775          * writing.  If the caller only wants to read the page, KVM must go
2776          * down the full, slow path in order to avoid racing an operation that
2777          * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing
2778          * at the old, read-only page while mm/ points at a new, writable page.
2779          */
2780         if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable))
2781                 return false;
2782
2783         if (kfp->pin)
2784                 r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1;
2785         else
2786                 r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page);
2787
2788         if (r) {
2789                 *pfn = kvm_resolve_pfn(kfp, page, NULL, true);
2790                 return true;
2791         }
2792
2793         return false;
2794 }
2795
/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
 *
 * On success the pfn is stored in @pfn and the page is handed to
 * kvm_resolve_pfn().  Any return value other than 1 from the GUP call is
 * passed back to the caller unchanged, and @pfn is left untouched.
 */
static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
{
        /*
         * When a VCPU accesses a page that is not mapped into the secondary
         * MMU, we lookup the page using GUP to map it, so the guest VCPU can
         * make progress. We always want to honor NUMA hinting faults in that
         * case, because GUP usage corresponds to memory accesses from the VCPU.
         * Otherwise, we'd not trigger NUMA hinting faults once a page is
         * mapped into the secondary MMU and gets accessed by a VCPU.
         *
         * Note that get_user_page_fast_only() and FOLL_WRITE for now
         * implicitly honor NUMA hinting faults and don't need this flag.
         */
        unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags;
        struct page *page, *wpage;
        int npages;

        if (kfp->pin)
                npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags);
        else
                npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
        /* Anything but exactly one page is reported to the caller as-is. */
        if (npages != 1)
                return npages;

        /*
         * Pinning is mutually exclusive with opportunistically mapping a read
         * fault as writable, as KVM should never pin pages when mapping memory
         * into the guest (pinning is only for direct accesses from KVM).
         */
        if (WARN_ON_ONCE(kfp->map_writable && kfp->pin))
                goto out;

        /*
         * map read fault as writable if possible: if a fast-only write lookup
         * succeeds, release the page obtained above and continue with the
         * page from the write lookup, reporting the result as writable.
         */
        if (!(flags & FOLL_WRITE) && kfp->map_writable &&
            get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) {
                put_page(page);
                page = wpage;
                flags |= FOLL_WRITE;
        }

out:
        /* "writable" is true iff FOLL_WRITE was requested or upgraded above. */
        *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE);
        return npages;
}
2844
2845 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2846 {
2847         if (unlikely(!(vma->vm_flags & VM_READ)))
2848                 return false;
2849
2850         if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2851                 return false;
2852
2853         return true;
2854 }
2855
/*
 * Resolve kfp->hva within @vma via follow_pfnmap_start(), faulting the
 * address in with fixup_user_fault() if the first lookup fails.
 *
 * Returns 0 with *p_pfn set (either to the resolved pfn or to
 * KVM_PFN_ERR_RO_FAULT when a write was requested on a non-writable mapping),
 * -EAGAIN if fixup_user_fault() reported "unlocked", -EINVAL if pinning was
 * requested while unsafe mappings are disallowed, or the error returned by
 * fixup_user_fault()/follow_pfnmap_start().
 */
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
                               struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn)
{
        struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva };
        bool write_fault = kfp->flags & FOLL_WRITE;
        int r;

        /*
         * Remapped memory cannot be pinned in any meaningful sense.  Bail if
         * the caller wants to pin the page, i.e. access the page outside of
         * MMU notifier protection, and unsafe mappings are disallowed.
         */
        if (kfp->pin && !allow_unsafe_mappings)
                return -EINVAL;

        r = follow_pfnmap_start(&args);
        if (r) {
                /*
                 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
                 * not call the fault handler, so do it here.
                 */
                bool unlocked = false;
                r = fixup_user_fault(current->mm, kfp->hva,
                                     (write_fault ? FAULT_FLAG_WRITE : 0),
                                     &unlocked);
                /* "unlocked" takes precedence over r: ask the caller to retry. */
                if (unlocked)
                        return -EAGAIN;
                if (r)
                        return r;

                /* The fault was handled; repeat the lookup once. */
                r = follow_pfnmap_start(&args);
                if (r)
                        return r;
        }

        /* From here on a lookup is active and must be closed at "out". */
        if (write_fault && !args.writable) {
                *p_pfn = KVM_PFN_ERR_RO_FAULT;
                goto out;
        }

        *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable);
out:
        follow_pfnmap_end(&args);
        return r;
}
2901
/*
 * Resolve kfp->hva to a pfn, or to one of the KVM_PFN_ERR_* codes.
 *
 * The fast path is tried first, then the slow path; if neither produces a
 * page, the VMA is inspected under the mmap read lock to either resolve a
 * VM_IO/VM_PFNMAP mapping or pick the error code to report.
 * kfp->refcounted_page must be non-NULL.
 */
kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp)
{
        struct vm_area_struct *vma;
        kvm_pfn_t pfn;
        int npages, r;

        might_sleep();

        if (WARN_ON_ONCE(!kfp->refcounted_page))
                return KVM_PFN_ERR_FAULT;

        if (hva_to_pfn_fast(kfp, &pfn))
                return pfn;

        npages = hva_to_pfn_slow(kfp, &pfn);
        if (npages == 1)
                return pfn;
        /* Map the slow-path errors that have a dedicated pfn error code. */
        if (npages == -EINTR || npages == -EAGAIN)
                return KVM_PFN_ERR_SIGPENDING;
        if (npages == -EHWPOISON)
                return KVM_PFN_ERR_HWPOISON;

        mmap_read_lock(current->mm);
retry:
        /* Re-done from scratch whenever the remapped path asks for a retry. */
        vma = vma_lookup(current->mm, kfp->hva);

        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
        else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
                r = hva_to_pfn_remapped(vma, kfp, &pfn);
                if (r == -EAGAIN)
                        goto retry;
                if (r < 0)
                        pfn = KVM_PFN_ERR_FAULT;
        } else {
                /*
                 * Ordinary VMA for which the lookup above failed: with
                 * FOLL_NOWAIT and a VMA that permits the access, report that
                 * I/O is needed; everything else is a plain fault.
                 */
                if ((kfp->flags & FOLL_NOWAIT) &&
                    vma_is_valid(vma, kfp->flags & FOLL_WRITE))
                        pfn = KVM_PFN_ERR_NEEDS_IO;
                else
                        pfn = KVM_PFN_ERR_FAULT;
        }
        mmap_read_unlock(current->mm);
        return pfn;
}
2946
2947 static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp)
2948 {
2949         kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL,
2950                                      kfp->flags & FOLL_WRITE);
2951
2952         if (kfp->hva == KVM_HVA_ERR_RO_BAD)
2953                 return KVM_PFN_ERR_RO_FAULT;
2954
2955         if (kvm_is_error_hva(kfp->hva))
2956                 return KVM_PFN_NOSLOT;
2957
2958         if (memslot_is_readonly(kfp->slot) && kfp->map_writable) {
2959                 *kfp->map_writable = false;
2960                 kfp->map_writable = NULL;
2961         }
2962
2963         return hva_to_pfn(kfp);
2964 }
2965
2966 kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn,
2967                             unsigned int foll, bool *writable,
2968                             struct page **refcounted_page)
2969 {
2970         struct kvm_follow_pfn kfp = {
2971                 .slot = slot,
2972                 .gfn = gfn,
2973                 .flags = foll,
2974                 .map_writable = writable,
2975                 .refcounted_page = refcounted_page,
2976         };
2977
2978         if (WARN_ON_ONCE(!writable || !refcounted_page))
2979                 return KVM_PFN_ERR_FAULT;
2980
2981         *writable = false;
2982         *refcounted_page = NULL;
2983
2984         return kvm_follow_pfn(&kfp);
2985 }
2986 EXPORT_SYMBOL_GPL(__kvm_faultin_pfn);
2987
2988 int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
2989                        struct page **pages, int nr_pages)
2990 {
2991         unsigned long addr;
2992         gfn_t entry = 0;
2993
2994         addr = gfn_to_hva_many(slot, gfn, &entry);
2995         if (kvm_is_error_hva(addr))
2996                 return -1;
2997
2998         if (entry < nr_pages)
2999                 return 0;
3000
3001         return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
3002 }
3003 EXPORT_SYMBOL_GPL(kvm_prefetch_pages);
3004
3005 /*
3006  * Don't use this API unless you are absolutely, positively certain that KVM
3007  * needs to get a struct page, e.g. to pin the page for firmware DMA.
3008  *
3009  * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate
3010  *        its refcount.
3011  */
3012 struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write)
3013 {
3014         struct page *refcounted_page = NULL;
3015         struct kvm_follow_pfn kfp = {
3016                 .slot = gfn_to_memslot(kvm, gfn),
3017                 .gfn = gfn,
3018                 .flags = write ? FOLL_WRITE : 0,
3019                 .refcounted_page = &refcounted_page,
3020         };
3021
3022         (void)kvm_follow_pfn(&kfp);
3023         return refcounted_page;
3024 }
3025 EXPORT_SYMBOL_GPL(__gfn_to_page);
3026
/*
 * Pin guest page @gfn and map it into the kernel, filling in @map.
 *
 * Returns 0 on success, -EINVAL if @gfn cannot be resolved to a pfn, and
 * -EFAULT if no kernel mapping could be set up (map->hva is NULL).
 *
 * NOTE(review): the memslot is looked up with gfn_to_memslot(vcpu->kvm, ...),
 * i.e. the VM-wide lookup, not kvm_vcpu_gfn_to_memslot() - confirm intended.
 */
int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
                   bool writable)
{
        struct kvm_follow_pfn kfp = {
                .slot = gfn_to_memslot(vcpu->kvm, gfn),
                .gfn = gfn,
                .flags = writable ? FOLL_WRITE : 0,
                .refcounted_page = &map->pinned_page,
                .pin = true,
        };

        /* Start from a clean map so every failure path leaves it consistent. */
        map->pinned_page = NULL;
        map->page = NULL;
        map->hva = NULL;
        map->gfn = gfn;
        map->writable = writable;

        map->pfn = kvm_follow_pfn(&kfp);
        if (is_error_noslot_pfn(map->pfn))
                return -EINVAL;

        /*
         * A pfn with a struct page is mapped with kmap(); any other pfn is
         * mapped with memremap(), but only when CONFIG_HAS_IOMEM is enabled.
         * Without it map->hva stays NULL and -EFAULT is returned below.
         */
        if (pfn_valid(map->pfn)) {
                map->page = pfn_to_page(map->pfn);
                map->hva = kmap(map->page);
#ifdef CONFIG_HAS_IOMEM
        } else {
                map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB);
#endif
        }

        return map->hva ? 0 : -EFAULT;
}
EXPORT_SYMBOL_GPL(__kvm_vcpu_map);
3060
/*
 * Tear down a mapping described by @map: remove the kernel mapping, mark the
 * gfn dirty if the mapping was writable, unpin the page if one was pinned,
 * and clear @map's pointers.  Does nothing if map->hva is NULL.
 */
void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map)
{
        if (!map->hva)
                return;

        /* map->page set means kmap() was used; otherwise memremap(). */
        if (map->page)
                kunmap(map->page);
#ifdef CONFIG_HAS_IOMEM
        else
                memunmap(map->hva);
#endif

        if (map->writable)
                kvm_vcpu_mark_page_dirty(vcpu, map->gfn);

        /* Update page state before dropping the pin. */
        if (map->pinned_page) {
                if (map->writable)
                        kvm_set_page_dirty(map->pinned_page);
                kvm_set_page_accessed(map->pinned_page);
                unpin_user_page(map->pinned_page);
        }

        /* With hva cleared, a repeated unmap returns at the check above. */
        map->hva = NULL;
        map->page = NULL;
        map->pinned_page = NULL;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
3088
3089 static int next_segment(unsigned long len, int offset)
3090 {
3091         if (len > PAGE_SIZE - offset)
3092                 return PAGE_SIZE - offset;
3093         else
3094                 return len;
3095 }
3096
3097 /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
3098 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3099                                  void *data, int offset, int len)
3100 {
3101         int r;
3102         unsigned long addr;
3103
3104         if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
3105                 return -EFAULT;
3106
3107         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3108         if (kvm_is_error_hva(addr))
3109                 return -EFAULT;
3110         r = __copy_from_user(data, (void __user *)addr + offset, len);
3111         if (r)
3112                 return -EFAULT;
3113         return 0;
3114 }
3115
3116 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
3117                         int len)
3118 {
3119         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3120
3121         return __kvm_read_guest_page(slot, gfn, data, offset, len);
3122 }
3123 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3124
3125 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
3126                              int offset, int len)
3127 {
3128         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3129
3130         return __kvm_read_guest_page(slot, gfn, data, offset, len);
3131 }
3132 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3133
3134 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
3135 {
3136         gfn_t gfn = gpa >> PAGE_SHIFT;
3137         int seg;
3138         int offset = offset_in_page(gpa);
3139         int ret;
3140
3141         while ((seg = next_segment(len, offset)) != 0) {
3142                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3143                 if (ret < 0)
3144                         return ret;
3145                 offset = 0;
3146                 len -= seg;
3147                 data += seg;
3148                 ++gfn;
3149         }
3150         return 0;
3151 }
3152 EXPORT_SYMBOL_GPL(kvm_read_guest);
3153
3154 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
3155 {
3156         gfn_t gfn = gpa >> PAGE_SHIFT;
3157         int seg;
3158         int offset = offset_in_page(gpa);
3159         int ret;
3160
3161         while ((seg = next_segment(len, offset)) != 0) {
3162                 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3163                 if (ret < 0)
3164                         return ret;
3165                 offset = 0;
3166                 len -= seg;
3167                 data += seg;
3168                 ++gfn;
3169         }
3170         return 0;
3171 }
3172 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
3173
3174 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3175                                    void *data, int offset, unsigned long len)
3176 {
3177         int r;
3178         unsigned long addr;
3179
3180         if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
3181                 return -EFAULT;
3182
3183         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3184         if (kvm_is_error_hva(addr))
3185                 return -EFAULT;
3186         pagefault_disable();
3187         r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
3188         pagefault_enable();
3189         if (r)
3190                 return -EFAULT;
3191         return 0;
3192 }
3193
3194 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3195                                void *data, unsigned long len)
3196 {
3197         gfn_t gfn = gpa >> PAGE_SHIFT;
3198         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3199         int offset = offset_in_page(gpa);
3200
3201         return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3202 }
3203 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3204
3205 /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
3206 static int __kvm_write_guest_page(struct kvm *kvm,
3207                                   struct kvm_memory_slot *memslot, gfn_t gfn,
3208                                   const void *data, int offset, int len)
3209 {
3210         int r;
3211         unsigned long addr;
3212
3213         if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
3214                 return -EFAULT;
3215
3216         addr = gfn_to_hva_memslot(memslot, gfn);
3217         if (kvm_is_error_hva(addr))
3218                 return -EFAULT;
3219         r = __copy_to_user((void __user *)addr + offset, data, len);
3220         if (r)
3221                 return -EFAULT;
3222         mark_page_dirty_in_slot(kvm, memslot, gfn);
3223         return 0;
3224 }
3225
3226 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3227                          const void *data, int offset, int len)
3228 {
3229         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3230
3231         return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
3232 }
3233 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3234
3235 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3236                               const void *data, int offset, int len)
3237 {
3238         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3239
3240         return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
3241 }
3242 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3243
3244 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
3245                     unsigned long len)
3246 {
3247         gfn_t gfn = gpa >> PAGE_SHIFT;
3248         int seg;
3249         int offset = offset_in_page(gpa);
3250         int ret;
3251
3252         while ((seg = next_segment(len, offset)) != 0) {
3253                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3254                 if (ret < 0)
3255                         return ret;
3256                 offset = 0;
3257                 len -= seg;
3258                 data += seg;
3259                 ++gfn;
3260         }
3261         return 0;
3262 }
3263 EXPORT_SYMBOL_GPL(kvm_write_guest);
3264
3265 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
3266                          unsigned long len)
3267 {
3268         gfn_t gfn = gpa >> PAGE_SHIFT;
3269         int seg;
3270         int offset = offset_in_page(gpa);
3271         int ret;
3272
3273         while ((seg = next_segment(len, offset)) != 0) {
3274                 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3275                 if (ret < 0)
3276                         return ret;
3277                 offset = 0;
3278                 len -= seg;
3279                 data += seg;
3280                 ++gfn;
3281         }
3282         return 0;
3283 }
3284 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3285
3286 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3287                                        struct gfn_to_hva_cache *ghc,
3288                                        gpa_t gpa, unsigned long len)
3289 {
3290         int offset = offset_in_page(gpa);
3291         gfn_t start_gfn = gpa >> PAGE_SHIFT;
3292         gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
3293         gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
3294         gfn_t nr_pages_avail;
3295
3296         /* Update ghc->generation before performing any error checks. */
3297         ghc->generation = slots->generation;
3298
3299         if (start_gfn > end_gfn) {
3300                 ghc->hva = KVM_HVA_ERR_BAD;
3301                 return -EINVAL;
3302         }
3303
3304         /*
3305          * If the requested region crosses two memslots, we still
3306          * verify that the entire region is valid here.
3307          */
3308         for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
3309                 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
3310                 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
3311                                            &nr_pages_avail);
3312                 if (kvm_is_error_hva(ghc->hva))
3313                         return -EFAULT;
3314         }
3315
3316         /* Use the slow path for cross page reads and writes. */
3317         if (nr_pages_needed == 1)
3318                 ghc->hva += offset;
3319         else
3320                 ghc->memslot = NULL;
3321
3322         ghc->gpa = gpa;
3323         ghc->len = len;
3324         return 0;
3325 }
3326
3327 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3328                               gpa_t gpa, unsigned long len)
3329 {
3330         struct kvm_memslots *slots = kvm_memslots(kvm);
3331         return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3332 }
3333 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
3334
3335 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3336                                   void *data, unsigned int offset,
3337                                   unsigned long len)
3338 {
3339         struct kvm_memslots *slots = kvm_memslots(kvm);
3340         int r;
3341         gpa_t gpa = ghc->gpa + offset;
3342
3343         if (WARN_ON_ONCE(len + offset > ghc->len))
3344                 return -EINVAL;
3345
3346         if (slots->generation != ghc->generation) {
3347                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3348                         return -EFAULT;
3349         }
3350
3351         if (kvm_is_error_hva(ghc->hva))
3352                 return -EFAULT;
3353
3354         if (unlikely(!ghc->memslot))
3355                 return kvm_write_guest(kvm, gpa, data, len);
3356
3357         r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3358         if (r)
3359                 return -EFAULT;
3360         mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3361
3362         return 0;
3363 }
3364 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3365
/*
 * Write @len bytes from @data at the start of the region cached in @ghc,
 * i.e. kvm_write_guest_offset_cached() with an offset of 0.
 */
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                           void *data, unsigned long len)
{
        const unsigned int no_offset = 0;

        return kvm_write_guest_offset_cached(kvm, ghc, data, no_offset, len);
}
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3372
3373 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3374                                  void *data, unsigned int offset,
3375                                  unsigned long len)
3376 {
3377         struct kvm_memslots *slots = kvm_memslots(kvm);
3378         int r;
3379         gpa_t gpa = ghc->gpa + offset;
3380
3381         if (WARN_ON_ONCE(len + offset > ghc->len))
3382                 return -EINVAL;
3383
3384         if (slots->generation != ghc->generation) {
3385                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3386                         return -EFAULT;
3387         }
3388
3389         if (kvm_is_error_hva(ghc->hva))
3390                 return -EFAULT;
3391
3392         if (unlikely(!ghc->memslot))
3393                 return kvm_read_guest(kvm, gpa, data, len);
3394
3395         r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3396         if (r)
3397                 return -EFAULT;
3398
3399         return 0;
3400 }
3401 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3402
/*
 * Read @len bytes into @data from the start of the region cached in @ghc,
 * i.e. kvm_read_guest_offset_cached() with an offset of 0.
 */
int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                          void *data, unsigned long len)
{
        const unsigned int no_offset = 0;

        return kvm_read_guest_offset_cached(kvm, ghc, data, no_offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3409
3410 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3411 {
3412         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3413         gfn_t gfn = gpa >> PAGE_SHIFT;
3414         int seg;
3415         int offset = offset_in_page(gpa);
3416         int ret;
3417
3418         while ((seg = next_segment(len, offset)) != 0) {
3419                 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
3420                 if (ret < 0)
3421                         return ret;
3422                 offset = 0;
3423                 len -= seg;
3424                 ++gfn;
3425         }
3426         return 0;
3427 }
3428 EXPORT_SYMBOL_GPL(kvm_clear_guest);
3429
3430 void mark_page_dirty_in_slot(struct kvm *kvm,
3431                              const struct kvm_memory_slot *memslot,
3432                              gfn_t gfn)
3433 {
3434         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3435
3436 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3437         if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
3438                 return;
3439
3440         WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3441 #endif
3442
3443         if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3444                 unsigned long rel_gfn = gfn - memslot->base_gfn;
3445                 u32 slot = (memslot->as_id << 16) | memslot->id;
3446
3447                 if (kvm->dirty_ring_size && vcpu)
3448                         kvm_dirty_ring_push(vcpu, slot, rel_gfn);
3449                 else if (memslot->dirty_bitmap)
3450                         set_bit_le(rel_gfn, memslot->dirty_bitmap);
3451         }
3452 }
3453 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3454
3455 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3456 {
3457         struct kvm_memory_slot *memslot;
3458
3459         memslot = gfn_to_memslot(kvm, gfn);
3460         mark_page_dirty_in_slot(kvm, memslot, gfn);
3461 }
3462 EXPORT_SYMBOL_GPL(mark_page_dirty);
3463
3464 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3465 {
3466         struct kvm_memory_slot *memslot;
3467
3468         memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3469         mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3470 }
3471 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3472
3473 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3474 {
3475         if (!vcpu->sigset_active)
3476                 return;
3477
3478         /*
3479          * This does a lockless modification of ->real_blocked, which is fine
3480          * because, only current can change ->real_blocked and all readers of
3481          * ->real_blocked don't care as long ->real_blocked is always a subset
3482          * of ->blocked.
3483          */
3484         sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3485 }
3486
3487 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3488 {
3489         if (!vcpu->sigset_active)
3490                 return;
3491
3492         sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3493         sigemptyset(&current->real_blocked);
3494 }
3495
3496 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3497 {
3498         unsigned int old, val, grow, grow_start;
3499
3500         old = val = vcpu->halt_poll_ns;
3501         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3502         grow = READ_ONCE(halt_poll_ns_grow);
3503         if (!grow)
3504                 goto out;
3505
3506         val *= grow;
3507         if (val < grow_start)
3508                 val = grow_start;
3509
3510         vcpu->halt_poll_ns = val;
3511 out:
3512         trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3513 }
3514
3515 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3516 {
3517         unsigned int old, val, shrink, grow_start;
3518
3519         old = val = vcpu->halt_poll_ns;
3520         shrink = READ_ONCE(halt_poll_ns_shrink);
3521         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3522         if (shrink == 0)
3523                 val = 0;
3524         else
3525                 val /= shrink;
3526
3527         if (val < grow_start)
3528                 val = 0;
3529
3530         vcpu->halt_poll_ns = val;
3531         trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3532 }
3533
3534 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3535 {
3536         int ret = -EINTR;
3537         int idx = srcu_read_lock(&vcpu->kvm->srcu);
3538
3539         if (kvm_arch_vcpu_runnable(vcpu))
3540                 goto out;
3541         if (kvm_cpu_has_pending_timer(vcpu))
3542                 goto out;
3543         if (signal_pending(current))
3544                 goto out;
3545         if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3546                 goto out;
3547
3548         ret = 0;
3549 out:
3550         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3551         return ret;
3552 }
3553
3554 /*
3555  * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3556  * pending.  This is mostly used when halting a vCPU, but may also be used
3557  * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3558  */
3559 bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
3560 {
3561         struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3562         bool waited = false;
3563
3564         vcpu->stat.generic.blocking = 1;
3565
3566         preempt_disable();
3567         kvm_arch_vcpu_blocking(vcpu);
3568         prepare_to_rcuwait(wait);
3569         preempt_enable();
3570
3571         for (;;) {
3572                 set_current_state(TASK_INTERRUPTIBLE);
3573
3574                 if (kvm_vcpu_check_block(vcpu) < 0)
3575                         break;
3576
3577                 waited = true;
3578                 schedule();
3579         }
3580
3581         preempt_disable();
3582         finish_rcuwait(wait);
3583         kvm_arch_vcpu_unblocking(vcpu);
3584         preempt_enable();
3585
3586         vcpu->stat.generic.blocking = 0;
3587
3588         return waited;
3589 }
3590
3591 static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3592                                           ktime_t end, bool success)
3593 {
3594         struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3595         u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3596
3597         ++vcpu->stat.generic.halt_attempted_poll;
3598
3599         if (success) {
3600                 ++vcpu->stat.generic.halt_successful_poll;
3601
3602                 if (!vcpu_valid_wakeup(vcpu))
3603                         ++vcpu->stat.generic.halt_poll_invalid;
3604
3605                 stats->halt_poll_success_ns += poll_ns;
3606                 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3607         } else {
3608                 stats->halt_poll_fail_ns += poll_ns;
3609                 KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3610         }
3611 }
3612
3613 static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3614 {
3615         struct kvm *kvm = vcpu->kvm;
3616
3617         if (kvm->override_halt_poll_ns) {
3618                 /*
3619                  * Ensure kvm->max_halt_poll_ns is not read before
3620                  * kvm->override_halt_poll_ns.
3621                  *
3622                  * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3623                  */
3624                 smp_rmb();
3625                 return READ_ONCE(kvm->max_halt_poll_ns);
3626         }
3627
3628         return READ_ONCE(halt_poll_ns);
3629 }
3630
/*
 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc...  If halt
 * polling is enabled, busy wait for a short time before blocking to avoid the
 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
 * is halted.
 *
 * After the halt completes, the per-vCPU polling window (vcpu->halt_poll_ns)
 * is grown, shrunk or zeroed based on how long the vCPU was halted, and the
 * halt-polling statistics and wakeup tracepoint are updated.
 */
void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
        unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
        bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
        ktime_t start, cur, poll_end;
        bool waited = false;
        bool do_halt_poll;
        u64 halt_ns;

        /* Clamp the current window to the (possibly lowered) maximum. */
        if (vcpu->halt_poll_ns > max_halt_poll_ns)
                vcpu->halt_poll_ns = max_halt_poll_ns;

        /* Poll only if the arch allows it and the window is non-zero. */
        do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;

        start = cur = poll_end = ktime_get();
        if (do_halt_poll) {
                ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);

                do {
                        /*
                         * A wake condition during polling skips blocking
                         * entirely; 'cur' and 'poll_end' then hold the
                         * timestamp taken at the end of the previous
                         * iteration (or 'start' on the first one).
                         */
                        if (kvm_vcpu_check_block(vcpu) < 0)
                                goto out;
                        cpu_relax();
                        poll_end = cur = ktime_get();
                } while (kvm_vcpu_can_poll(cur, stop));
        }

        waited = kvm_vcpu_block(vcpu);

        cur = ktime_get();
        if (waited) {
                /* Time spent blocked, measured from the end of polling. */
                vcpu->stat.generic.halt_wait_ns +=
                        ktime_to_ns(cur) - ktime_to_ns(poll_end);
                KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
                                ktime_to_ns(cur) - ktime_to_ns(poll_end));
        }
out:
        /* The total time the vCPU was "halted", including polling time. */
        halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);

        /*
         * Note, halt-polling is considered successful so long as the vCPU was
         * never actually scheduled out, i.e. even if the wake event arrived
         * after the halt-polling loop itself, but before the full wait.
         */
        if (do_halt_poll)
                update_halt_poll_stats(vcpu, start, poll_end, !waited);

        if (halt_poll_allowed) {
                /* Recompute the max halt poll time in case it changed. */
                max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);

                if (!vcpu_valid_wakeup(vcpu)) {
                        /* Invalid wakeup: shrink the polling window. */
                        shrink_halt_poll_ns(vcpu);
                } else if (max_halt_poll_ns) {
                        /* Halt fit inside the current window: leave it. */
                        if (halt_ns <= vcpu->halt_poll_ns)
                                ;
                        /* we had a long block, shrink polling */
                        else if (vcpu->halt_poll_ns &&
                                 halt_ns > max_halt_poll_ns)
                                shrink_halt_poll_ns(vcpu);
                        /* we had a short halt and our poll time is too small */
                        else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
                                 halt_ns < max_halt_poll_ns)
                                grow_halt_poll_ns(vcpu);
                } else {
                        /* A maximum of 0 disables polling for this vCPU. */
                        vcpu->halt_poll_ns = 0;
                }
        }

        trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
3709
3710 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3711 {
3712         if (__kvm_vcpu_wake_up(vcpu)) {
3713                 WRITE_ONCE(vcpu->ready, true);
3714                 ++vcpu->stat.generic.halt_wakeup;
3715                 return true;
3716         }
3717
3718         return false;
3719 }
3720 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3721
3722 #ifndef CONFIG_S390
/*
 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
 *
 * If kvm_vcpu_wake_up() wakes the vCPU, nothing more is done.  Otherwise the
 * vCPU is either moved from IN_GUEST_MODE to EXITING_GUEST_MODE directly (when
 * it is the vCPU running on this CPU), or a reschedule IPI is sent to the CPU
 * recorded in vcpu->cpu if the architecture says a kick is warranted.
 */
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
        int me, cpu;

        if (kvm_vcpu_wake_up(vcpu))
                return;

        /* 'me' is the current CPU; paired with put_cpu() at 'out'. */
        me = get_cpu();
        /*
         * The only state change done outside the vcpu mutex is IN_GUEST_MODE
         * to EXITING_GUEST_MODE.  Therefore the moderately expensive "should
         * kick" check does not need atomic operations if kvm_vcpu_kick is used
         * within the vCPU thread itself.
         */
        if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
                if (vcpu->mode == IN_GUEST_MODE)
                        WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
                goto out;
        }

        /*
         * Note, the vCPU could get migrated to a different pCPU at any point
         * after kvm_arch_vcpu_should_kick(), which could result in sending an
         * IPI to the previous pCPU.  But, that's ok because the purpose of the
         * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
         * vCPU also requires it to leave IN_GUEST_MODE.
         */
        if (kvm_arch_vcpu_should_kick(vcpu)) {
                cpu = READ_ONCE(vcpu->cpu);
                /*
                 * Skip the IPI for the local CPU, for values outside
                 * [0, nr_cpu_ids) (the unsigned cast also rejects negative
                 * values), and for CPUs that are not online.
                 */
                if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
                        smp_send_reschedule(cpu);
        }
out:
        put_cpu();
}
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3762 #endif /* !CONFIG_S390 */
3763
3764 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3765 {
3766         struct task_struct *task = NULL;
3767         int ret;
3768
3769         if (!read_trylock(&target->pid_lock))
3770                 return 0;
3771
3772         if (target->pid)
3773                 task = get_pid_task(target->pid, PIDTYPE_PID);
3774
3775         read_unlock(&target->pid_lock);
3776
3777         if (!task)
3778                 return 0;
3779         ret = yield_to(task, 1);
3780         put_task_struct(task);
3781
3782         return ret;
3783 }
3784 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3785
3786 /*
3787  * Helper that checks whether a VCPU is eligible for directed yield.
3788  * Most eligible candidate to yield is decided by following heuristics:
3789  *
3790  *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
3791  *  (preempted lock holder), indicated by @in_spin_loop.
3792  *  Set at the beginning and cleared at the end of interception/PLE handler.
3793  *
3794  *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
3795  *  chance last time (mostly it has become eligible now since we have probably
3796  *  yielded to lockholder in last iteration. This is done by toggling
3797  *  @dy_eligible each time a VCPU checked for eligibility.)
3798  *
3799  *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3800  *  to preempted lock-holder could result in wrong VCPU selection and CPU
3801  *  burning. Giving priority for a potential lock-holder increases lock
3802  *  progress.
3803  *
3804  *  Since algorithm is based on heuristics, accessing another VCPU data without
3805  *  locking does not harm. It may result in trying to yield to  same VCPU, fail
3806  *  and continue with next VCPU and so on.
3807  */
3808 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3809 {
3810 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3811         bool eligible;
3812
3813         eligible = !vcpu->spin_loop.in_spin_loop ||
3814                     vcpu->spin_loop.dy_eligible;
3815
3816         if (vcpu->spin_loop.in_spin_loop)
3817                 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3818
3819         return eligible;
3820 #else
3821         return true;
3822 #endif
3823 }
3824
3825 /*
3826  * Unlike kvm_arch_vcpu_runnable, this function is called outside
3827  * a vcpu_load/vcpu_put pair.  However, for most architectures
3828  * kvm_arch_vcpu_runnable does not require vcpu_load.
3829  */
3830 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3831 {
3832         return kvm_arch_vcpu_runnable(vcpu);
3833 }
3834
3835 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3836 {
3837         if (kvm_arch_dy_runnable(vcpu))
3838                 return true;
3839
3840 #ifdef CONFIG_KVM_ASYNC_PF
3841         if (!list_empty_careful(&vcpu->async_pf.done))
3842                 return true;
3843 #endif
3844
3845         return false;
3846 }
3847
3848 /*
3849  * By default, simply query the target vCPU's current mode when checking if a
3850  * vCPU was preempted in kernel mode.  All architectures except x86 (or more
3851  * specifical, except VMX) allow querying whether or not a vCPU is in kernel
3852  * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
3853  * directly for cross-vCPU checks is functionally correct and accurate.
3854  */
3855 bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
3856 {
3857         return kvm_arch_vcpu_in_kernel(vcpu);
3858 }
3859
/*
 * Weak default for the directed-yield "has a pending interrupt" query: always
 * reports false.  Being __weak, a non-weak definition elsewhere replaces it.
 */
bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
        return false;
}
3864
3865 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3866 {
3867         int nr_vcpus, start, i, idx, yielded;
3868         struct kvm *kvm = me->kvm;
3869         struct kvm_vcpu *vcpu;
3870         int try = 3;
3871
3872         nr_vcpus = atomic_read(&kvm->online_vcpus);
3873         if (nr_vcpus < 2)
3874                 return;
3875
3876         /* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */
3877         smp_rmb();
3878
3879         kvm_vcpu_set_in_spin_loop(me, true);
3880
3881         /*
3882          * The current vCPU ("me") is spinning in kernel mode, i.e. is likely
3883          * waiting for a resource to become available.  Attempt to yield to a
3884          * vCPU that is runnable, but not currently running, e.g. because the
3885          * vCPU was preempted by a higher priority task.  With luck, the vCPU
3886          * that was preempted is holding a lock or some other resource that the
3887          * current vCPU is waiting to acquire, and yielding to the other vCPU
3888          * will allow it to make forward progress and release the lock (or kick
3889          * the spinning vCPU, etc).
3890          *
3891          * Since KVM has no insight into what exactly the guest is doing,
3892          * approximate a round-robin selection by iterating over all vCPUs,
3893          * starting at the last boosted vCPU.  I.e. if N=kvm->last_boosted_vcpu,
3894          * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed.
3895          *
3896          * Note, this is inherently racy, e.g. if multiple vCPUs are spinning,
3897          * they may all try to yield to the same vCPU(s).  But as above, this
3898          * is all best effort due to KVM's lack of visibility into the guest.
3899          */
3900         start = READ_ONCE(kvm->last_boosted_vcpu) + 1;
3901         for (i = 0; i < nr_vcpus; i++) {
3902                 idx = (start + i) % nr_vcpus;
3903                 if (idx == me->vcpu_idx)
3904                         continue;
3905
3906                 vcpu = xa_load(&kvm->vcpu_array, idx);
3907                 if (!READ_ONCE(vcpu->ready))
3908                         continue;
3909                 if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
3910                         continue;
3911
3912                 /*
3913                  * Treat the target vCPU as being in-kernel if it has a pending
3914                  * interrupt, as the vCPU trying to yield may be spinning
3915                  * waiting on IPI delivery, i.e. the target vCPU is in-kernel
3916                  * for the purposes of directed yield.
3917                  */
3918                 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3919                     !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3920                     !kvm_arch_vcpu_preempted_in_kernel(vcpu))
3921                         continue;
3922
3923                 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3924                         continue;
3925
3926                 yielded = kvm_vcpu_yield_to(vcpu);
3927                 if (yielded > 0) {
3928                         WRITE_ONCE(kvm->last_boosted_vcpu, i);
3929                         break;
3930                 } else if (yielded < 0 && !--try) {
3931                         break;
3932                 }
3933         }
3934         kvm_vcpu_set_in_spin_loop(me, false);
3935
3936         /* Ensure vcpu is not eligible during next spinloop */
3937         kvm_vcpu_set_dy_eligible(me, false);
3938 }
3939 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3940
3941 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3942 {
3943 #ifdef CONFIG_HAVE_KVM_DIRTY_RING
3944         return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3945             (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3946              kvm->dirty_ring_size / PAGE_SIZE);
3947 #else
3948         return false;
3949 #endif
3950 }
3951
3952 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3953 {
3954         struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3955         struct page *page;
3956
3957         if (vmf->pgoff == 0)
3958                 page = virt_to_page(vcpu->run);
3959 #ifdef CONFIG_X86
3960         else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3961                 page = virt_to_page(vcpu->arch.pio_data);
3962 #endif
3963 #ifdef CONFIG_KVM_MMIO
3964         else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3965                 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3966 #endif
3967         else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3968                 page = kvm_dirty_ring_get_page(
3969                     &vcpu->dirty_ring,
3970                     vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3971         else
3972                 return kvm_arch_vcpu_fault(vcpu, vmf);
3973         get_page(page);
3974         vmf->page = page;
3975         return 0;
3976 }
3977
/* VMA operations installed by kvm_vcpu_mmap(); only .fault is provided. */
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
        .fault = kvm_vcpu_fault,
};
3981
3982 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3983 {
3984         struct kvm_vcpu *vcpu = file->private_data;
3985         unsigned long pages = vma_pages(vma);
3986
3987         if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3988              kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3989             ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3990                 return -EINVAL;
3991
3992         vma->vm_ops = &kvm_vcpu_vm_ops;
3993         return 0;
3994 }
3995
3996 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3997 {
3998         struct kvm_vcpu *vcpu = filp->private_data;
3999
4000         kvm_put_kvm(vcpu->kvm);
4001         return 0;
4002 }
4003
/*
 * File operations for the vCPU file created by create_vcpu_fd().  The compat
 * ioctl entry is supplied through the KVM_COMPAT() macro.
 */
static struct file_operations kvm_vcpu_fops = {
        .release        = kvm_vcpu_release,
        .unlocked_ioctl = kvm_vcpu_ioctl,
        .mmap           = kvm_vcpu_mmap,
        .llseek         = noop_llseek,
        KVM_COMPAT(kvm_vcpu_compat_ioctl),
};
4011
4012 /*
4013  * Allocates an inode for the vcpu.
4014  */
4015 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
4016 {
4017         char name[8 + 1 + ITOA_MAX_LEN + 1];
4018
4019         snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
4020         return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
4021 }
4022
4023 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
4024 static int vcpu_get_pid(void *data, u64 *val)
4025 {
4026         struct kvm_vcpu *vcpu = data;
4027
4028         read_lock(&vcpu->pid_lock);
4029         *val = pid_nr(vcpu->pid);
4030         read_unlock(&vcpu->pid_lock);
4031         return 0;
4032 }
4033
4034 DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
4035
4036 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
4037 {
4038         struct dentry *debugfs_dentry;
4039         char dir_name[ITOA_MAX_LEN * 2];
4040
4041         if (!debugfs_initialized())
4042                 return;
4043
4044         snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
4045         debugfs_dentry = debugfs_create_dir(dir_name,
4046                                             vcpu->kvm->debugfs_dentry);
4047         debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
4048                             &vcpu_get_pid_fops);
4049
4050         kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
4051 }
4052 #endif
4053
/*
 * Create one vCPU with id @id for @kvm.
 *
 * On success the vCPU is stored in kvm->vcpu_array, kvm->online_vcpus is
 * incremented and the value returned by create_vcpu_fd() is returned.  On
 * failure everything set up so far is unwound and a negative errno is
 * returned:
 *   -EINVAL  @id >= KVM_MAX_VCPU_IDS, the VM is already at max_vcpus, or
 *            storing into vcpu_array failed;
 *   -ENOMEM  the vCPU structure or its run page could not be allocated;
 *   -EEXIST  a vCPU with @id already exists;
 *   or the error from kvm_arch_vcpu_precreate(), kvm_arch_vcpu_create(),
 *   kvm_dirty_ring_alloc(), xa_reserve() or create_vcpu_fd().
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
{
        int r;
        struct kvm_vcpu *vcpu;
        struct page *page;

        /*
         * KVM tracks vCPU IDs as 'int', be kind to userspace and reject
         * too-large values instead of silently truncating.
         *
         * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
         * changing the storage type (at the very least, IDs should be tracked
         * as unsigned ints).
         */
        BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
        if (id >= KVM_MAX_VCPU_IDS)
                return -EINVAL;

        /*
         * Claim a slot in created_vcpus under kvm->lock; the lock is dropped
         * for the allocations below and re-taken before publishing the vCPU.
         */
        mutex_lock(&kvm->lock);
        if (kvm->created_vcpus >= kvm->max_vcpus) {
                mutex_unlock(&kvm->lock);
                return -EINVAL;
        }

        r = kvm_arch_vcpu_precreate(kvm, id);
        if (r) {
                mutex_unlock(&kvm->lock);
                return r;
        }

        kvm->created_vcpus++;
        mutex_unlock(&kvm->lock);

        vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
        if (!vcpu) {
                r = -ENOMEM;
                goto vcpu_decrement;
        }

        /* struct kvm_run must fit in the single zeroed page backing it. */
        BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!page) {
                r = -ENOMEM;
                goto vcpu_free;
        }
        vcpu->run = page_address(page);

        kvm_vcpu_init(vcpu, kvm, id);

        r = kvm_arch_vcpu_create(vcpu);
        if (r)
                goto vcpu_free_run_page;

        /* A dirty ring is allocated only if the VM has a ring size set. */
        if (kvm->dirty_ring_size) {
                r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
                                         id, kvm->dirty_ring_size);
                if (r)
                        goto arch_vcpu_destroy;
        }

        mutex_lock(&kvm->lock);

#ifdef CONFIG_LOCKDEP
        /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
        mutex_lock(&vcpu->mutex);
        mutex_unlock(&vcpu->mutex);
#endif

        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
                goto unlock_vcpu_destroy;
        }

        /* The new vCPU takes the next free index in vcpu_array. */
        vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
        r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
        if (r)
                goto unlock_vcpu_destroy;

        /* Now it's all set up, let userspace reach it */
        kvm_get_kvm(kvm);
        r = create_vcpu_fd(vcpu);
        if (r < 0)
                goto kvm_put_xa_release;

        /*
         * NOTE(review): on this failure path create_vcpu_fd() has already
         * succeeded, and nothing in the unwind below closes that fd before
         * the vCPU is freed -- confirm this is acceptable given that
         * KVM_BUG_ON() flags the VM.
         */
        if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
                r = -EINVAL;
                goto kvm_put_xa_release;
        }

        /*
         * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
         * pointer before kvm->online_vcpu's incremented value.
         */
        smp_wmb();
        atomic_inc(&kvm->online_vcpus);

        mutex_unlock(&kvm->lock);
        kvm_arch_vcpu_postcreate(vcpu);
        kvm_create_vcpu_debugfs(vcpu);
        return r;

        /* Error unwind: each label undoes one setup step, in reverse order. */
kvm_put_xa_release:
        kvm_put_kvm_no_destroy(kvm);
        xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
        kvm_dirty_ring_free(&vcpu->dirty_ring);
arch_vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
vcpu_free_run_page:
        free_page((unsigned long)vcpu->run);
vcpu_free:
        kmem_cache_free(kvm_vcpu_cache, vcpu);
vcpu_decrement:
        mutex_lock(&kvm->lock);
        kvm->created_vcpus--;
        mutex_unlock(&kvm->lock);
        return r;
}
4176
4177 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
4178 {
4179         if (sigset) {
4180                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
4181                 vcpu->sigset_active = 1;
4182                 vcpu->sigset = *sigset;
4183         } else
4184                 vcpu->sigset_active = 0;
4185         return 0;
4186 }
4187
4188 static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
4189                               size_t size, loff_t *offset)
4190 {
4191         struct kvm_vcpu *vcpu = file->private_data;
4192
4193         return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
4194                         &kvm_vcpu_stats_desc[0], &vcpu->stat,
4195                         sizeof(vcpu->stat), user_buffer, size, offset);
4196 }
4197
4198 static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
4199 {
4200         struct kvm_vcpu *vcpu = file->private_data;
4201
4202         kvm_put_kvm(vcpu->kvm);
4203         return 0;
4204 }
4205
/*
 * File operations for the per-vCPU stats file created by
 * kvm_vcpu_ioctl_get_stats_fd(): read-only access via kvm_vcpu_stats_read(),
 * with seeking routed to noop_llseek.
 */
static const struct file_operations kvm_vcpu_stats_fops = {
        .owner = THIS_MODULE,
        .read = kvm_vcpu_stats_read,
        .release = kvm_vcpu_stats_release,
        .llseek = noop_llseek,
};
4212
4213 static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4214 {
4215         int fd;
4216         struct file *file;
4217         char name[15 + ITOA_MAX_LEN + 1];
4218
4219         snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4220
4221         fd = get_unused_fd_flags(O_CLOEXEC);
4222         if (fd < 0)
4223                 return fd;
4224
4225         file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
4226         if (IS_ERR(file)) {
4227                 put_unused_fd(fd);
4228                 return PTR_ERR(file);
4229         }
4230
4231         kvm_get_kvm(vcpu->kvm);
4232
4233         file->f_mode |= FMODE_PREAD;
4234         fd_install(fd, file);
4235
4236         return fd;
4237 }
4238
4239 #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
4240 static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
4241                                      struct kvm_pre_fault_memory *range)
4242 {
4243         int idx;
4244         long r;
4245         u64 full_size;
4246
4247         if (range->flags)
4248                 return -EINVAL;
4249
4250         if (!PAGE_ALIGNED(range->gpa) ||
4251             !PAGE_ALIGNED(range->size) ||
4252             range->gpa + range->size <= range->gpa)
4253                 return -EINVAL;
4254
4255         vcpu_load(vcpu);
4256         idx = srcu_read_lock(&vcpu->kvm->srcu);
4257
4258         full_size = range->size;
4259         do {
4260                 if (signal_pending(current)) {
4261                         r = -EINTR;
4262                         break;
4263                 }
4264
4265                 r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
4266                 if (WARN_ON_ONCE(r == 0 || r == -EIO))
4267                         break;
4268
4269                 if (r < 0)
4270                         break;
4271
4272                 range->size -= r;
4273                 range->gpa += r;
4274                 cond_resched();
4275         } while (range->size);
4276
4277         srcu_read_unlock(&vcpu->kvm->srcu, idx);
4278         vcpu_put(vcpu);
4279
4280         /* Return success if at least one page was mapped successfully.  */
4281         return full_size == range->size ? r : 0;
4282 }
4283 #endif
4284
/*
 * ioctl() entry point for a vCPU file descriptor.
 *
 * Order of checks:
 *  - the caller must share the VM's mm and the VM must not be marked dead,
 *    otherwise -EIO;
 *  - the ioctl type must be KVMIO, otherwise -EINVAL;
 *  - kvm_arch_vcpu_async_ioctl() gets first refusal; any result other than
 *    -ENOIOCTLCMD is returned without taking vcpu->mutex;
 *  - everything else runs under vcpu->mutex (-EINTR if the killable lock
 *    acquisition is interrupted).
 *
 * Ioctls not handled by the switch are passed to kvm_arch_vcpu_ioctl().
 * The 'fpu' and 'kvm_sregs' buffers are freed on the common exit path, so
 * cases that use them may simply 'goto out' or 'break'.
 */
static long kvm_vcpu_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
{
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;
        struct kvm_fpu *fpu = NULL;
        struct kvm_sregs *kvm_sregs = NULL;

        if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
                return -EIO;

        if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
                return -EINVAL;

        /*
         * Some architectures have vcpu ioctls that are asynchronous to vcpu
         * execution; mutex_lock() would break them.
         */
        r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
        if (r != -ENOIOCTLCMD)
                return r;

        if (mutex_lock_killable(&vcpu->mutex))
                return -EINTR;
        switch (ioctl) {
        case KVM_RUN: {
                struct pid *oldpid;
                /* KVM_RUN takes no argument; a non-zero arg is rejected. */
                r = -EINVAL;
                if (arg)
                        goto out;

                /*
                 * Note, vcpu->pid is primarily protected by vcpu->mutex. The
                 * dedicated r/w lock allows other tasks, e.g. other vCPUs, to
                 * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield
                 * directly to this vCPU
                 */
                oldpid = vcpu->pid;
                if (unlikely(oldpid != task_pid(current))) {
                        /* The thread running this VCPU changed. */
                        struct pid *newpid;

                        r = kvm_arch_vcpu_run_pid_change(vcpu);
                        if (r)
                                break;

                        newpid = get_task_pid(current, PIDTYPE_PID);
                        write_lock(&vcpu->pid_lock);
                        vcpu->pid = newpid;
                        write_unlock(&vcpu->pid_lock);

                        /* Drop the reference held on the previous pid. */
                        put_pid(oldpid);
                }
                /* wants_to_run is only set for the duration of the run call. */
                vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
                r = kvm_arch_vcpu_ioctl_run(vcpu);
                vcpu->wants_to_run = false;

                trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
                break;
        }
        case KVM_GET_REGS: {
                struct kvm_regs *kvm_regs;

                r = -ENOMEM;
                kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
                if (!kvm_regs)
                        goto out;
                r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
                if (r)
                        goto out_free1;
                r = -EFAULT;
                if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
                        goto out_free1;
                r = 0;
out_free1:
                /* kvm_regs is local to this case, so it is freed here. */
                kfree(kvm_regs);
                break;
        }
        case KVM_SET_REGS: {
                struct kvm_regs *kvm_regs;

                kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
                if (IS_ERR(kvm_regs)) {
                        r = PTR_ERR(kvm_regs);
                        goto out;
                }
                r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
                kfree(kvm_regs);
                break;
        }
        case KVM_GET_SREGS: {
                /* kvm_sregs is released by the kfree() at 'out'. */
                kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
                r = -ENOMEM;
                if (!kvm_sregs)
                        goto out;
                r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
                if (r)
                        goto out;
                r = -EFAULT;
                if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_SREGS: {
                kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
                if (IS_ERR(kvm_sregs)) {
                        r = PTR_ERR(kvm_sregs);
                        /* Reset so 'out' does not kfree() an error pointer. */
                        kvm_sregs = NULL;
                        goto out;
                }
                r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
                break;
        }
        case KVM_GET_MP_STATE: {
                struct kvm_mp_state mp_state;

                r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
                if (r)
                        goto out;
                r = -EFAULT;
                if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_MP_STATE: {
                struct kvm_mp_state mp_state;

                r = -EFAULT;
                if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
                break;
        }
        case KVM_TRANSLATE: {
                struct kvm_translation tr;

                /* In/out argument: read the request, write back the result. */
                r = -EFAULT;
                if (copy_from_user(&tr, argp, sizeof(tr)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
                if (r)
                        goto out;
                r = -EFAULT;
                if (copy_to_user(argp, &tr, sizeof(tr)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_GUEST_DEBUG: {
                struct kvm_guest_debug dbg;

                r = -EFAULT;
                if (copy_from_user(&dbg, argp, sizeof(dbg)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
                break;
        }
        case KVM_SET_SIGNAL_MASK: {
                struct kvm_signal_mask __user *sigmask_arg = argp;
                struct kvm_signal_mask kvm_sigmask;
                sigset_t sigset, *p;

                /* A NULL argument leaves p NULL, which clears the mask. */
                p = NULL;
                if (argp) {
                        r = -EFAULT;
                        if (copy_from_user(&kvm_sigmask, argp,
                                           sizeof(kvm_sigmask)))
                                goto out;
                        r = -EINVAL;
                        if (kvm_sigmask.len != sizeof(sigset))
                                goto out;
                        r = -EFAULT;
                        if (copy_from_user(&sigset, sigmask_arg->sigset,
                                           sizeof(sigset)))
                                goto out;
                        p = &sigset;
                }
                r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
                break;
        }
        case KVM_GET_FPU: {
                /* fpu is released by the kfree() at 'out'. */
                fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
                r = -ENOMEM;
                if (!fpu)
                        goto out;
                r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
                if (r)
                        goto out;
                r = -EFAULT;
                if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_FPU: {
                fpu = memdup_user(argp, sizeof(*fpu));
                if (IS_ERR(fpu)) {
                        r = PTR_ERR(fpu);
                        /* Reset so 'out' does not kfree() an error pointer. */
                        fpu = NULL;
                        goto out;
                }
                r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
                break;
        }
        case KVM_GET_STATS_FD: {
                r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
                break;
        }
#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
        case KVM_PRE_FAULT_MEMORY: {
                struct kvm_pre_fault_memory range;

                r = -EFAULT;
                if (copy_from_user(&range, argp, sizeof(range)))
                        break;
                r = kvm_vcpu_pre_fault_memory(vcpu, &range);
                /* Pass back leftover range. */
                if (copy_to_user(argp, &range, sizeof(range)))
                        r = -EFAULT;
                break;
        }
#endif
        default:
                r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
        }
out:
        mutex_unlock(&vcpu->mutex);
        /* Both pointers are NULL unless a case above allocated them. */
        kfree(fpu);
        kfree(kvm_sregs);
        return r;
}
4519
4520 #ifdef CONFIG_KVM_COMPAT
4521 static long kvm_vcpu_compat_ioctl(struct file *filp,
4522                                   unsigned int ioctl, unsigned long arg)
4523 {
4524         struct kvm_vcpu *vcpu = filp->private_data;
4525         void __user *argp = compat_ptr(arg);
4526         int r;
4527
4528         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
4529                 return -EIO;
4530
4531         switch (ioctl) {
4532         case KVM_SET_SIGNAL_MASK: {
4533                 struct kvm_signal_mask __user *sigmask_arg = argp;
4534                 struct kvm_signal_mask kvm_sigmask;
4535                 sigset_t sigset;
4536
4537                 if (argp) {
4538                         r = -EFAULT;
4539                         if (copy_from_user(&kvm_sigmask, argp,
4540                                            sizeof(kvm_sigmask)))
4541                                 goto out;
4542                         r = -EINVAL;
4543                         if (kvm_sigmask.len != sizeof(compat_sigset_t))
4544                                 goto out;
4545                         r = -EFAULT;
4546                         if (get_compat_sigset(&sigset,
4547                                               (compat_sigset_t __user *)sigmask_arg->sigset))
4548                                 goto out;
4549                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4550                 } else
4551                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4552                 break;
4553         }
4554         default:
4555                 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4556         }
4557
4558 out:
4559         return r;
4560 }
4561 #endif
4562
4563 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4564 {
4565         struct kvm_device *dev = filp->private_data;
4566
4567         if (dev->ops->mmap)
4568                 return dev->ops->mmap(dev, vma);
4569
4570         return -ENODEV;
4571 }
4572
4573 static int kvm_device_ioctl_attr(struct kvm_device *dev,
4574                                  int (*accessor)(struct kvm_device *dev,
4575                                                  struct kvm_device_attr *attr),
4576                                  unsigned long arg)
4577 {
4578         struct kvm_device_attr attr;
4579
4580         if (!accessor)
4581                 return -EPERM;
4582
4583         if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4584                 return -EFAULT;
4585
4586         return accessor(dev, &attr);
4587 }
4588
4589 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4590                              unsigned long arg)
4591 {
4592         struct kvm_device *dev = filp->private_data;
4593
4594         if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
4595                 return -EIO;
4596
4597         switch (ioctl) {
4598         case KVM_SET_DEVICE_ATTR:
4599                 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4600         case KVM_GET_DEVICE_ATTR:
4601                 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4602         case KVM_HAS_DEVICE_ATTR:
4603                 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4604         default:
4605                 if (dev->ops->ioctl)
4606                         return dev->ops->ioctl(dev, ioctl, arg);
4607
4608                 return -ENOTTY;
4609         }
4610 }
4611
4612 static int kvm_device_release(struct inode *inode, struct file *filp)
4613 {
4614         struct kvm_device *dev = filp->private_data;
4615         struct kvm *kvm = dev->kvm;
4616
4617         if (dev->ops->release) {
4618                 mutex_lock(&kvm->lock);
4619                 list_del_rcu(&dev->vm_node);
4620                 synchronize_rcu();
4621                 dev->ops->release(dev);
4622                 mutex_unlock(&kvm->lock);
4623         }
4624
4625         kvm_put_kvm(kvm);
4626         return 0;
4627 }
4628
/*
 * File operations for KVM device file descriptors.  The address of this
 * table is also what kvm_device_from_filp() compares against to recognise
 * a device file.
 */
static struct file_operations kvm_device_fops = {
        .unlocked_ioctl = kvm_device_ioctl,
        .release = kvm_device_release,
        KVM_COMPAT(kvm_device_ioctl),
        .mmap = kvm_device_mmap,
};
4635
4636 struct kvm_device *kvm_device_from_filp(struct file *filp)
4637 {
4638         if (filp->f_op != &kvm_device_fops)
4639                 return NULL;
4640
4641         return filp->private_data;
4642 }
4643
/*
 * Device ops indexed by device type.  Entries are NULL until filled in,
 * either statically below (MPIC, when CONFIG_KVM_MPIC is set) or at
 * runtime through kvm_register_device_ops().
 */
static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
#ifdef CONFIG_KVM_MPIC
        [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
        [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
#endif
};
4650
4651 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4652 {
4653         if (type >= ARRAY_SIZE(kvm_device_ops_table))
4654                 return -ENOSPC;
4655
4656         if (kvm_device_ops_table[type] != NULL)
4657                 return -EEXIST;
4658
4659         kvm_device_ops_table[type] = ops;
4660         return 0;
4661 }
4662
4663 void kvm_unregister_device_ops(u32 type)
4664 {
4665         if (kvm_device_ops_table[type] != NULL)
4666                 kvm_device_ops_table[type] = NULL;
4667 }
4668
/*
 * Create a device of type @cd->type for @kvm and return its file
 * descriptor in @cd->fd.
 *
 * With KVM_CREATE_DEVICE_TEST set in @cd->flags, only the existence of the
 * device type is checked and nothing is created.
 *
 * Returns 0 on success, -ENODEV for an unknown or unregistered type,
 * -ENOMEM if the device cannot be allocated, or the negative error from
 * the device's create op or from anon_inode_getfd().
 */
static int kvm_ioctl_create_device(struct kvm *kvm,
                                   struct kvm_create_device *cd)
{
        const struct kvm_device_ops *ops;
        struct kvm_device *dev;
        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
        int type;
        int ret;

        if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
                return -ENODEV;

        /* Clamp the user-controlled index under speculation as well. */
        type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
        ops = kvm_device_ops_table[type];
        if (ops == NULL)
                return -ENODEV;

        if (test)
                return 0;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
        if (!dev)
                return -ENOMEM;

        dev->ops = ops;
        dev->kvm = kvm;

        /* Device creation and list insertion happen under kvm->lock. */
        mutex_lock(&kvm->lock);
        ret = ops->create(dev, type);
        if (ret < 0) {
                mutex_unlock(&kvm->lock);
                kfree(dev);
                return ret;
        }
        list_add_rcu(&dev->vm_node, &kvm->devices);
        mutex_unlock(&kvm->lock);

        if (ops->init)
                ops->init(dev);

        /* Reference for the device file; dropped in kvm_device_release(). */
        kvm_get_kvm(kvm);
        ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
        if (ret < 0) {
                /* No file exists, so undo the reference and the list insertion. */
                kvm_put_kvm_no_destroy(kvm);
                mutex_lock(&kvm->lock);
                list_del_rcu(&dev->vm_node);
                synchronize_rcu();
                if (ops->release)
                        ops->release(dev);
                mutex_unlock(&kvm->lock);
                if (ops->destroy)
                        ops->destroy(dev);
                return ret;
        }

        cd->fd = ret;
        return 0;
}
4727
/*
 * Report support for capability @arg, handling the capabilities that are
 * implemented by generic KVM code.
 *
 * The return value is capability specific: 1 for plain "supported", a
 * limit or offset for others, and 0 for the dirty-ring capabilities when
 * the corresponding config option is not set.  Capabilities not listed
 * here are passed to kvm_vm_ioctl_check_extension().
 *
 * @kvm may be NULL (see the explicit NULL checks below) -- presumably for
 * queries that are not tied to a particular VM; confirm against callers.
 */
static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
{
        switch (arg) {
        case KVM_CAP_USER_MEMORY:
        case KVM_CAP_USER_MEMORY2:
        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
        case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
        case KVM_CAP_INTERNAL_ERROR_DATA:
#ifdef CONFIG_HAVE_KVM_MSI
        case KVM_CAP_SIGNAL_MSI:
#endif
#ifdef CONFIG_HAVE_KVM_IRQCHIP
        case KVM_CAP_IRQFD:
#endif
        case KVM_CAP_IOEVENTFD_ANY_LENGTH:
        case KVM_CAP_CHECK_EXTENSION_VM:
        case KVM_CAP_ENABLE_CAP_VM:
        case KVM_CAP_HALT_POLL:
                return 1;
#ifdef CONFIG_KVM_MMIO
        case KVM_CAP_COALESCED_MMIO:
                return KVM_COALESCED_MMIO_PAGE_OFFSET;
        case KVM_CAP_COALESCED_PIO:
                return 1;
#endif
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
        case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
                return KVM_DIRTY_LOG_MANUAL_CAPS;
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        case KVM_CAP_IRQ_ROUTING:
                return KVM_MAX_IRQ_ROUTES;
#endif
#if KVM_MAX_NR_ADDRESS_SPACES > 1
        case KVM_CAP_MULTI_ADDRESS_SPACE:
                /* Per-VM count when a VM is given, else the global maximum. */
                if (kvm)
                        return kvm_arch_nr_memslot_as_ids(kvm);
                return KVM_MAX_NR_ADDRESS_SPACES;
#endif
        case KVM_CAP_NR_MEMSLOTS:
                return KVM_USER_MEM_SLOTS;
        case KVM_CAP_DIRTY_LOG_RING:
                /* Maximum ring size in bytes, or 0 when not configured. */
#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
                return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
#else
                return 0;
#endif
        case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
                /* Maximum ring size in bytes, or 0 when not configured. */
#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
                return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
#else
                return 0;
#endif
#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
        case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
#endif
        case KVM_CAP_BINARY_STATS_FD:
        case KVM_CAP_SYSTEM_EVENT_DATA:
        case KVM_CAP_DEVICE_CTRL:
                return 1;
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
        case KVM_CAP_MEMORY_ATTRIBUTES:
                return kvm_supported_mem_attributes(kvm);
#endif
#ifdef CONFIG_KVM_PRIVATE_MEM
        case KVM_CAP_GUEST_MEMFD:
                return !kvm || kvm_arch_has_private_mem(kvm);
#endif
        default:
                break;
        }
        /* Not a generic capability: let the arch-level check decide. */
        return kvm_vm_ioctl_check_extension(kvm, arg);
}
4801
4802 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4803 {
4804         int r;
4805
4806         if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4807                 return -EINVAL;
4808
4809         /* the size should be power of 2 */
4810         if (!size || (size & (size - 1)))
4811                 return -EINVAL;
4812
4813         /* Should be bigger to keep the reserved entries, or a page */
4814         if (size < kvm_dirty_ring_get_rsvd_entries() *
4815             sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4816                 return -EINVAL;
4817
4818         if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4819             sizeof(struct kvm_dirty_gfn))
4820                 return -E2BIG;
4821
4822         /* We only allow it to set once */
4823         if (kvm->dirty_ring_size)
4824                 return -EINVAL;
4825
4826         mutex_lock(&kvm->lock);
4827
4828         if (kvm->created_vcpus) {
4829                 /* We don't allow to change this value after vcpu created */
4830                 r = -EINVAL;
4831         } else {
4832                 kvm->dirty_ring_size = size;
4833                 r = 0;
4834         }
4835
4836         mutex_unlock(&kvm->lock);
4837         return r;
4838 }
4839
4840 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4841 {
4842         unsigned long i;
4843         struct kvm_vcpu *vcpu;
4844         int cleared = 0;
4845
4846         if (!kvm->dirty_ring_size)
4847                 return -EINVAL;
4848
4849         mutex_lock(&kvm->slots_lock);
4850
4851         kvm_for_each_vcpu(i, vcpu, kvm)
4852                 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4853
4854         mutex_unlock(&kvm->slots_lock);
4855
4856         if (cleared)
4857                 kvm_flush_remote_tlbs(kvm);
4858
4859         return cleared;
4860 }
4861
/*
 * Weak default for enabling a VM capability that generic code does not
 * handle: rejects everything with -EINVAL.  Being a weak symbol, it can be
 * replaced by a non-weak definition elsewhere.
 */
int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                                                  struct kvm_enable_cap *cap)
{
        return -EINVAL;
}
4867
4868 bool kvm_are_all_memslots_empty(struct kvm *kvm)
4869 {
4870         int i;
4871
4872         lockdep_assert_held(&kvm->slots_lock);
4873
4874         for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
4875                 if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
4876                         return false;
4877         }
4878
4879         return true;
4880 }
4881 EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
4882
/*
 * Enable a VM capability, handling those implemented by generic KVM code
 * and passing everything else to kvm_vm_ioctl_enable_cap().
 *
 * Returns 0 on success or a negative errno; -EINVAL is used for invalid
 * flags/arguments and for capabilities whose preconditions are not met.
 */
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
                                           struct kvm_enable_cap *cap)
{
        switch (cap->cap) {
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
        case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
                u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;

                /* The remaining option bits are only valid together with ENABLE. */
                if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
                        allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;

                if (cap->flags || (cap->args[0] & ~allowed_options))
                        return -EINVAL;
                kvm->manual_dirty_log_protect = cap->args[0];
                return 0;
        }
#endif
        case KVM_CAP_HALT_POLL: {
                /* args[0] must fit in an unsigned int. */
                if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
                        return -EINVAL;

                kvm->max_halt_poll_ns = cap->args[0];

                /*
                 * Ensure kvm->override_halt_poll_ns does not become visible
                 * before kvm->max_halt_poll_ns.
                 *
                 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
                 */
                smp_wmb();
                kvm->override_halt_poll_ns = true;

                return 0;
        }
        case KVM_CAP_DIRTY_LOG_RING:
        case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
                /* Only the ring flavour reported as supported may be enabled. */
                if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
                        return -EINVAL;

                return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
        case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
                int r = -EINVAL;

                /* Requires config support and an already-enabled dirty ring. */
                if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
                    !kvm->dirty_ring_size || cap->flags)
                        return r;

                mutex_lock(&kvm->slots_lock);

                /*
                 * For simplicity, allow enabling ring+bitmap if and only if
                 * there are no memslots, e.g. to ensure all memslots allocate
                 * a bitmap after the capability is enabled.
                 */
                if (kvm_are_all_memslots_empty(kvm)) {
                        kvm->dirty_ring_with_bitmap = true;
                        r = 0;
                }

                mutex_unlock(&kvm->slots_lock);

                return r;
        }
        default:
                return kvm_vm_ioctl_enable_cap(kvm, cap);
        }
}
4950
4951 static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4952                               size_t size, loff_t *offset)
4953 {
4954         struct kvm *kvm = file->private_data;
4955
4956         return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4957                                 &kvm_vm_stats_desc[0], &kvm->stat,
4958                                 sizeof(kvm->stat), user_buffer, size, offset);
4959 }
4960
4961 static int kvm_vm_stats_release(struct inode *inode, struct file *file)
4962 {
4963         struct kvm *kvm = file->private_data;
4964
4965         kvm_put_kvm(kvm);
4966         return 0;
4967 }
4968
/*
 * File operations for the per-VM stats file created by
 * kvm_vm_ioctl_get_stats_fd(): read-only access via kvm_vm_stats_read(),
 * with seeking routed to noop_llseek.
 */
static const struct file_operations kvm_vm_stats_fops = {
        .owner = THIS_MODULE,
        .read = kvm_vm_stats_read,
        .release = kvm_vm_stats_release,
        .llseek = noop_llseek,
};
4975
4976 static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4977 {
4978         int fd;
4979         struct file *file;
4980
4981         fd = get_unused_fd_flags(O_CLOEXEC);
4982         if (fd < 0)
4983                 return fd;
4984
4985         file = anon_inode_getfile("kvm-vm-stats",
4986                         &kvm_vm_stats_fops, kvm, O_RDONLY);
4987         if (IS_ERR(file)) {
4988                 put_unused_fd(fd);
4989                 return PTR_ERR(file);
4990         }
4991
4992         kvm_get_kvm(kvm);
4993
4994         file->f_mode |= FMODE_PREAD;
4995         fd_install(fd, file);
4996
4997         return fd;
4998 }
4999
/*
 * Build-time check that @field has the same offset and the same size in
 * struct kvm_userspace_memory_region and struct kvm_userspace_memory_region2.
 */
#define SANITY_CHECK_MEM_REGION_FIELD(field)                                    \
do {                                                                            \
        BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) !=             \
                     offsetof(struct kvm_userspace_memory_region2, field));     \
        BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) !=         \
                     sizeof_field(struct kvm_userspace_memory_region2, field)); \
} while (0)
5007
/*
 * ioctl handler for VM file descriptors (the .unlocked_ioctl of kvm_vm_fops).
 *
 * @filp:  VM file; filp->private_data is the struct kvm.
 * @ioctl: request number.
 * @arg:   request argument, used either as a plain value or as a userspace
 *         pointer depending on the request.
 *
 * Returns -EIO when the caller's mm is not the VM's mm or the VM is marked
 * dead, -EFAULT when an argument cannot be copied to or from userspace, and
 * otherwise the result of the per-request helper.  Requests not listed in the
 * switch are forwarded to kvm_arch_vm_ioctl().
 */
static long kvm_vm_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
{
        struct kvm *kvm = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;

        /* Reject callers whose mm differs from the VM's, and any dead VM. */
        if (kvm->mm != current->mm || kvm->vm_dead)
                return -EIO;
        switch (ioctl) {
        case KVM_CREATE_VCPU:
                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
                break;
        case KVM_ENABLE_CAP: {
                struct kvm_enable_cap cap;

                r = -EFAULT;
                if (copy_from_user(&cap, argp, sizeof(cap)))
                        goto out;
                r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
                break;
        }
        case KVM_SET_USER_MEMORY_REGION2:
        case KVM_SET_USER_MEMORY_REGION: {
                struct kvm_userspace_memory_region2 mem;
                unsigned long size;

                if (ioctl == KVM_SET_USER_MEMORY_REGION) {
                        /*
                         * Fields beyond struct kvm_userspace_memory_region shouldn't be
                         * accessed, but avoid leaking kernel memory in case of a bug.
                         */
                        memset(&mem, 0, sizeof(mem));
                        size = sizeof(struct kvm_userspace_memory_region);
                } else {
                        size = sizeof(struct kvm_userspace_memory_region2);
                }

                /* Ensure the common parts of the two structs are identical. */
                SANITY_CHECK_MEM_REGION_FIELD(slot);
                SANITY_CHECK_MEM_REGION_FIELD(flags);
                SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
                SANITY_CHECK_MEM_REGION_FIELD(memory_size);
                SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);

                r = -EFAULT;
                if (copy_from_user(&mem, argp, size))
                        goto out;

                /* The v1 ioctl accepts only the v1 flag set. */
                r = -EINVAL;
                if (ioctl == KVM_SET_USER_MEMORY_REGION &&
                    (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
                        goto out;

                r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
                break;
        }
        case KVM_GET_DIRTY_LOG: {
                struct kvm_dirty_log log;

                r = -EFAULT;
                if (copy_from_user(&log, argp, sizeof(log)))
                        goto out;
                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
                break;
        }
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
        case KVM_CLEAR_DIRTY_LOG: {
                struct kvm_clear_dirty_log log;

                r = -EFAULT;
                if (copy_from_user(&log, argp, sizeof(log)))
                        goto out;
                r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
                break;
        }
#endif
#ifdef CONFIG_KVM_MMIO
        case KVM_REGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;

                r = -EFAULT;
                if (copy_from_user(&zone, argp, sizeof(zone)))
                        goto out;
                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
                break;
        }
        case KVM_UNREGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;

                r = -EFAULT;
                if (copy_from_user(&zone, argp, sizeof(zone)))
                        goto out;
                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
                break;
        }
#endif
        case KVM_IRQFD: {
                struct kvm_irqfd data;

                r = -EFAULT;
                if (copy_from_user(&data, argp, sizeof(data)))
                        goto out;
                r = kvm_irqfd(kvm, &data);
                break;
        }
        case KVM_IOEVENTFD: {
                struct kvm_ioeventfd data;

                r = -EFAULT;
                if (copy_from_user(&data, argp, sizeof(data)))
                        goto out;
                r = kvm_ioeventfd(kvm, &data);
                break;
        }
#ifdef CONFIG_HAVE_KVM_MSI
        case KVM_SIGNAL_MSI: {
                struct kvm_msi msi;

                r = -EFAULT;
                if (copy_from_user(&msi, argp, sizeof(msi)))
                        goto out;
                r = kvm_send_userspace_msi(kvm, &msi);
                break;
        }
#endif
#ifdef __KVM_HAVE_IRQ_LINE
        case KVM_IRQ_LINE_STATUS:
        case KVM_IRQ_LINE: {
                struct kvm_irq_level irq_event;

                r = -EFAULT;
                if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
                        goto out;

                r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
                                        ioctl == KVM_IRQ_LINE_STATUS);
                if (r)
                        goto out;

                /* Only the _STATUS variant copies irq_event back out. */
                r = -EFAULT;
                if (ioctl == KVM_IRQ_LINE_STATUS) {
                        if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
                                goto out;
                }

                r = 0;
                break;
        }
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        case KVM_SET_GSI_ROUTING: {
                struct kvm_irq_routing routing;
                struct kvm_irq_routing __user *urouting;
                struct kvm_irq_routing_entry *entries = NULL;

                r = -EFAULT;
                if (copy_from_user(&routing, argp, sizeof(routing)))
                        goto out;
                r = -EINVAL;
                if (!kvm_arch_can_set_irq_routing(kvm))
                        goto out;
                if (routing.nr > KVM_MAX_IRQ_ROUTES)
                        goto out;
                if (routing.flags)
                        goto out;
                /*
                 * Duplicate the entry array that follows the header in the
                 * userspace structure; entries stays NULL when nr is zero.
                 */
                if (routing.nr) {
                        urouting = argp;
                        entries = vmemdup_array_user(urouting->entries,
                                                     routing.nr, sizeof(*entries));
                        if (IS_ERR(entries)) {
                                r = PTR_ERR(entries);
                                goto out;
                        }
                }
                r = kvm_set_irq_routing(kvm, entries, routing.nr,
                                        routing.flags);
                kvfree(entries);
                break;
        }
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
        case KVM_SET_MEMORY_ATTRIBUTES: {
                struct kvm_memory_attributes attrs;

                r = -EFAULT;
                if (copy_from_user(&attrs, argp, sizeof(attrs)))
                        goto out;

                r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
                break;
        }
#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
        case KVM_CREATE_DEVICE: {
                struct kvm_create_device cd;

                r = -EFAULT;
                if (copy_from_user(&cd, argp, sizeof(cd)))
                        goto out;

                r = kvm_ioctl_create_device(kvm, &cd);
                if (r)
                        goto out;

                /* Hand the structure back to userspace after creation. */
                r = -EFAULT;
                if (copy_to_user(argp, &cd, sizeof(cd)))
                        goto out;

                r = 0;
                break;
        }
        case KVM_CHECK_EXTENSION:
                r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
                break;
        case KVM_RESET_DIRTY_RINGS:
                r = kvm_vm_ioctl_reset_dirty_pages(kvm);
                break;
        case KVM_GET_STATS_FD:
                r = kvm_vm_ioctl_get_stats_fd(kvm);
                break;
#ifdef CONFIG_KVM_PRIVATE_MEM
        case KVM_CREATE_GUEST_MEMFD: {
                struct kvm_create_guest_memfd guest_memfd;

                r = -EFAULT;
                if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
                        goto out;

                r = kvm_gmem_create(kvm, &guest_memfd);
                break;
        }
#endif
        default:
                /* Everything else is left to the architecture code. */
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
        }
out:
        return r;
}
5246
5247 #ifdef CONFIG_KVM_COMPAT
/*
 * Argument layout read by kvm_vm_compat_ioctl() for KVM_GET_DIRTY_LOG.  The
 * bitmap pointer is a compat_uptr_t that shares storage with a 64-bit
 * padding word.
 */
struct compat_kvm_dirty_log {
        __u32 slot;
        __u32 padding1;
        union {
                compat_uptr_t dirty_bitmap; /* one bit per page */
                __u64 padding2;
        };
};
5256
/*
 * Argument layout read by kvm_vm_compat_ioctl() for KVM_CLEAR_DIRTY_LOG.  The
 * bitmap pointer is a compat_uptr_t that shares storage with a 64-bit
 * padding word.
 */
struct compat_kvm_clear_dirty_log {
        __u32 slot;
        __u32 num_pages;
        __u64 first_page;
        union {
                compat_uptr_t dirty_bitmap; /* one bit per page */
                __u64 padding2;
        };
};
5266
/*
 * Weak default for the arch compat-ioctl hook: handles nothing and returns
 * -ENOTTY, which kvm_vm_compat_ioctl() treats as "not handled by the arch".
 */
long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
                                     unsigned long arg)
{
        return -ENOTTY;
}
5272
5273 static long kvm_vm_compat_ioctl(struct file *filp,
5274                            unsigned int ioctl, unsigned long arg)
5275 {
5276         struct kvm *kvm = filp->private_data;
5277         int r;
5278
5279         if (kvm->mm != current->mm || kvm->vm_dead)
5280                 return -EIO;
5281
5282         r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5283         if (r != -ENOTTY)
5284                 return r;
5285
5286         switch (ioctl) {
5287 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5288         case KVM_CLEAR_DIRTY_LOG: {
5289                 struct compat_kvm_clear_dirty_log compat_log;
5290                 struct kvm_clear_dirty_log log;
5291
5292                 if (copy_from_user(&compat_log, (void __user *)arg,
5293                                    sizeof(compat_log)))
5294                         return -EFAULT;
5295                 log.slot         = compat_log.slot;
5296                 log.num_pages    = compat_log.num_pages;
5297                 log.first_page   = compat_log.first_page;
5298                 log.padding2     = compat_log.padding2;
5299                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5300
5301                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
5302                 break;
5303         }
5304 #endif
5305         case KVM_GET_DIRTY_LOG: {
5306                 struct compat_kvm_dirty_log compat_log;
5307                 struct kvm_dirty_log log;
5308
5309                 if (copy_from_user(&compat_log, (void __user *)arg,
5310                                    sizeof(compat_log)))
5311                         return -EFAULT;
5312                 log.slot         = compat_log.slot;
5313                 log.padding1     = compat_log.padding1;
5314                 log.padding2     = compat_log.padding2;
5315                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
5316
5317                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
5318                 break;
5319         }
5320         default:
5321                 r = kvm_vm_ioctl(filp, ioctl, arg);
5322         }
5323         return r;
5324 }
5325 #endif
5326
/*
 * File operations for VM file descriptors.  file_is_kvm() recognises VM
 * files by comparing their f_op against this table.
 */
static struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
        .llseek         = noop_llseek,
        KVM_COMPAT(kvm_vm_compat_ioctl),
};
5333
5334 bool file_is_kvm(struct file *file)
5335 {
5336         return file && file->f_op == &kvm_vm_fops;
5337 }
5338 EXPORT_SYMBOL_GPL(file_is_kvm);
5339
5340 static int kvm_dev_ioctl_create_vm(unsigned long type)
5341 {
5342         char fdname[ITOA_MAX_LEN + 1];
5343         int r, fd;
5344         struct kvm *kvm;
5345         struct file *file;
5346
5347         fd = get_unused_fd_flags(O_CLOEXEC);
5348         if (fd < 0)
5349                 return fd;
5350
5351         snprintf(fdname, sizeof(fdname), "%d", fd);
5352
5353         kvm = kvm_create_vm(type, fdname);
5354         if (IS_ERR(kvm)) {
5355                 r = PTR_ERR(kvm);
5356                 goto put_fd;
5357         }
5358
5359         file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
5360         if (IS_ERR(file)) {
5361                 r = PTR_ERR(file);
5362                 goto put_kvm;
5363         }
5364
5365         /*
5366          * Don't call kvm_put_kvm anymore at this point; file->f_op is
5367          * already set, with ->release() being kvm_vm_release().  In error
5368          * cases it will be called by the final fput(file) and will take
5369          * care of doing kvm_put_kvm(kvm).
5370          */
5371         kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
5372
5373         fd_install(fd, file);
5374         return fd;
5375
5376 put_kvm:
5377         kvm_put_kvm(kvm);
5378 put_fd:
5379         put_unused_fd(fd);
5380         return r;
5381 }
5382
5383 static long kvm_dev_ioctl(struct file *filp,
5384                           unsigned int ioctl, unsigned long arg)
5385 {
5386         int r = -EINVAL;
5387
5388         switch (ioctl) {
5389         case KVM_GET_API_VERSION:
5390                 if (arg)
5391                         goto out;
5392                 r = KVM_API_VERSION;
5393                 break;
5394         case KVM_CREATE_VM:
5395                 r = kvm_dev_ioctl_create_vm(arg);
5396                 break;
5397         case KVM_CHECK_EXTENSION:
5398                 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5399                 break;
5400         case KVM_GET_VCPU_MMAP_SIZE:
5401                 if (arg)
5402                         goto out;
5403                 r = PAGE_SIZE;     /* struct kvm_run */
5404 #ifdef CONFIG_X86
5405                 r += PAGE_SIZE;    /* pio data page */
5406 #endif
5407 #ifdef CONFIG_KVM_MMIO
5408                 r += PAGE_SIZE;    /* coalesced mmio ring page */
5409 #endif
5410                 break;
5411         default:
5412                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
5413         }
5414 out:
5415         return r;
5416 }
5417
/*
 * File operations for the "kvm" misc device (see kvm_dev).  The same handler,
 * kvm_dev_ioctl(), is also passed to KVM_COMPAT().
 */
static struct file_operations kvm_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .llseek         = noop_llseek,
        KVM_COMPAT(kvm_dev_ioctl),
};
5423
5424 static struct miscdevice kvm_dev = {
5425         KVM_MINOR,
5426         "kvm",
5427         &kvm_chardev_ops,
5428 };
5429
5430 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
/*
 * Read-only (0444) module parameter.  When true, kvm_init_virtualization()
 * and kvm_uninit_virtualization() take and drop a virtualization usage
 * reference themselves.
 */
static bool enable_virt_at_load = true;
module_param(enable_virt_at_load, bool, 0444);

/* Set by kvm_shutdown() before virtualization is disabled on every CPU. */
__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

/* Per-CPU flag: has virtualization been enabled on this CPU by KVM? */
static DEFINE_PER_CPU(bool, virtualization_enabled);
/* Held (via guard) around every update of kvm_usage_count. */
static DEFINE_MUTEX(kvm_usage_lock);
/* Number of outstanding kvm_enable_virtualization() references. */
static int kvm_usage_count;
5440
/*
 * Weak default for the arch hook called by kvm_enable_virtualization() when
 * the first usage reference is taken; does nothing.
 */
__weak void kvm_arch_enable_virtualization(void)
{

}
5445
/*
 * Weak default for the arch hook called when the last usage reference is
 * dropped (or when enabling is unwound after a failure); does nothing.
 */
__weak void kvm_arch_disable_virtualization(void)
{

}
5450
5451 static int kvm_enable_virtualization_cpu(void)
5452 {
5453         if (__this_cpu_read(virtualization_enabled))
5454                 return 0;
5455
5456         if (kvm_arch_enable_virtualization_cpu()) {
5457                 pr_info("kvm: enabling virtualization on CPU%d failed\n",
5458                         raw_smp_processor_id());
5459                 return -EIO;
5460         }
5461
5462         __this_cpu_write(virtualization_enabled, true);
5463         return 0;
5464 }
5465
/*
 * CPU-hotplug "online" callback, installed by kvm_enable_virtualization()
 * through cpuhp_setup_state().  @cpu is unused; the work is done on the
 * calling CPU.  Returns the result of kvm_enable_virtualization_cpu().
 */
static int kvm_online_cpu(unsigned int cpu)
{
        /*
         * Abort the CPU online process if hardware virtualization cannot
         * be enabled. Otherwise running VMs would encounter unrecoverable
         * errors when scheduled to this CPU.
         */
        return kvm_enable_virtualization_cpu();
}
5475
5476 static void kvm_disable_virtualization_cpu(void *ign)
5477 {
5478         if (!__this_cpu_read(virtualization_enabled))
5479                 return;
5480
5481         kvm_arch_disable_virtualization_cpu();
5482
5483         __this_cpu_write(virtualization_enabled, false);
5484 }
5485
/*
 * CPU-hotplug "offline" callback, installed by kvm_enable_virtualization()
 * through cpuhp_setup_state().  @cpu is unused.  Always returns 0.
 */
static int kvm_offline_cpu(unsigned int cpu)
{
        kvm_disable_virtualization_cpu(NULL);
        return 0;
}
5491
/*
 * syscore .shutdown callback (see kvm_syscore_ops): log the event, set
 * kvm_rebooting, then disable virtualization on every CPU, waiting for all
 * of them to finish.
 */
static void kvm_shutdown(void)
{
        /*
         * Disable hardware virtualization and set kvm_rebooting to indicate
         * that KVM has asynchronously disabled hardware virtualization, i.e.
         * that relevant errors and exceptions aren't entirely unexpected.
         * Some flavors of hardware virtualization need to be disabled before
         * transferring control to firmware (to perform shutdown/reboot), e.g.
         * on x86, virtualization can block INIT interrupts, which are used by
         * firmware to pull APs back under firmware control.  Note, this path
         * is used for both shutdown and reboot scenarios, i.e. neither name is
         * 100% comprehensive.
         */
        pr_info("kvm: exiting hardware virtualization\n");
        kvm_rebooting = true;
        on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
}
5509
/*
 * syscore .suspend callback (see kvm_syscore_ops): disable virtualization on
 * the calling CPU.  Always returns 0.
 */
static int kvm_suspend(void)
{
        /*
         * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
         * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
         * count is stable.  Assert that kvm_usage_lock is not held to ensure
         * the system isn't suspended while KVM is enabling hardware.  Hardware
         * enabling can be preempted, but the task cannot be frozen until it has
         * dropped all locks (userspace tasks are frozen via a fake signal).
         */
        lockdep_assert_not_held(&kvm_usage_lock);
        lockdep_assert_irqs_disabled();

        kvm_disable_virtualization_cpu(NULL);
        return 0;
}
5526
/*
 * syscore .resume callback (see kvm_syscore_ops): re-enable virtualization on
 * the calling CPU, warning once if that fails.  The same lockdep assertions
 * as in kvm_suspend() apply.
 */
static void kvm_resume(void)
{
        lockdep_assert_not_held(&kvm_usage_lock);
        lockdep_assert_irqs_disabled();

        WARN_ON_ONCE(kvm_enable_virtualization_cpu());
}
5534
/*
 * Suspend/resume/shutdown hooks.  Registered by kvm_enable_virtualization()
 * and unregistered by kvm_disable_virtualization().
 */
static struct syscore_ops kvm_syscore_ops = {
        .suspend = kvm_suspend,
        .resume = kvm_resume,
        .shutdown = kvm_shutdown,
};
5540
/*
 * Take a usage reference on hardware virtualization.
 *
 * Runs under kvm_usage_lock.  Only the first user (count going from 0 to 1)
 * does real work: it calls the arch enable hook, installs the CPU-hotplug
 * state with kvm_online_cpu()/kvm_offline_cpu() as callbacks, and registers
 * the syscore ops.  Later users only bump kvm_usage_count.
 *
 * Returns 0 on success, the cpuhp_setup_state() error if that fails, or
 * -EBUSY if the system is halting, powering off or restarting.  On failure
 * everything done here is undone and the count is dropped again.
 */
static int kvm_enable_virtualization(void)
{
        int r;

        guard(mutex)(&kvm_usage_lock);

        if (kvm_usage_count++)
                return 0;

        kvm_arch_enable_virtualization();

        r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
                              kvm_online_cpu, kvm_offline_cpu);
        if (r)
                goto err_cpuhp;

        register_syscore_ops(&kvm_syscore_ops);

        /*
         * Undo virtualization enabling and bail if the system is going down.
         * If userspace initiated a forced reboot, e.g. reboot -f, then it's
         * possible for an in-flight operation to enable virtualization after
         * syscore_shutdown() is called, i.e. without kvm_shutdown() being
         * invoked.  Note, this relies on system_state being set _before_
         * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
         * or this CPU observes the impending shutdown.  Which is why KVM uses
         * a syscore ops hook instead of registering a dedicated reboot
         * notifier (the latter runs before system_state is updated).
         */
        if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
            system_state == SYSTEM_RESTART) {
                r = -EBUSY;
                goto err_rebooting;
        }

        return 0;

        /* Unwind in the reverse order of setup. */
err_rebooting:
        unregister_syscore_ops(&kvm_syscore_ops);
        cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
err_cpuhp:
        kvm_arch_disable_virtualization();
        --kvm_usage_count;
        return r;
}
5586
/*
 * Drop a usage reference on hardware virtualization.
 *
 * Runs under kvm_usage_lock.  When the count reaches zero, tear down in the
 * reverse order of kvm_enable_virtualization(): syscore ops, CPU-hotplug
 * state, then the arch disable hook.
 */
static void kvm_disable_virtualization(void)
{
        guard(mutex)(&kvm_usage_lock);

        if (--kvm_usage_count)
                return;

        unregister_syscore_ops(&kvm_syscore_ops);
        cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
        kvm_arch_disable_virtualization();
}
5598
5599 static int kvm_init_virtualization(void)
5600 {
5601         if (enable_virt_at_load)
5602                 return kvm_enable_virtualization();
5603
5604         return 0;
5605 }
5606
5607 static void kvm_uninit_virtualization(void)
5608 {
5609         if (enable_virt_at_load)
5610                 kvm_disable_virtualization();
5611 }
5612 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
/* Stub for builds without generic hardware enabling: always succeeds. */
static int kvm_enable_virtualization(void)
{
        return 0;
}
5617
/* Stub for builds without generic hardware enabling: always succeeds. */
static int kvm_init_virtualization(void)
{
        return 0;
}
5622
/* Stub for builds without generic hardware enabling: nothing to disable. */
static void kvm_disable_virtualization(void)
{

}
5627
/* Stub for builds without generic hardware enabling: nothing to undo. */
static void kvm_uninit_virtualization(void)
{

}
5632 #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5633
5634 static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5635 {
5636         if (dev->ops->destructor)
5637                 dev->ops->destructor(dev);
5638 }
5639
5640 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
5641 {
5642         int i;
5643
5644         for (i = 0; i < bus->dev_count; i++) {
5645                 struct kvm_io_device *pos = bus->range[i].dev;
5646
5647                 kvm_iodevice_destructor(pos);
5648         }
5649         kfree(bus);
5650 }
5651
5652 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5653                                  const struct kvm_io_range *r2)
5654 {
5655         gpa_t addr1 = r1->addr;
5656         gpa_t addr2 = r2->addr;
5657
5658         if (addr1 < addr2)
5659                 return -1;
5660
5661         /* If r2->len == 0, match the exact address.  If r2->len != 0,
5662          * accept any overlapping write.  Any order is acceptable for
5663          * overlapping ranges, because kvm_io_bus_get_first_dev ensures
5664          * we process all of them.
5665          */
5666         if (r2->len) {
5667                 addr1 += r1->len;
5668                 addr2 += r2->len;
5669         }
5670
5671         if (addr1 > addr2)
5672                 return 1;
5673
5674         return 0;
5675 }
5676
/*
 * void-pointer wrapper around kvm_io_bus_cmp(), used as the comparison
 * callback for bsearch() in kvm_io_bus_get_first_dev().
 */
static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
{
        return kvm_io_bus_cmp(p1, p2);
}
5681
5682 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5683                              gpa_t addr, int len)
5684 {
5685         struct kvm_io_range *range, key;
5686         int off;
5687
5688         key = (struct kvm_io_range) {
5689                 .addr = addr,
5690                 .len = len,
5691         };
5692
5693         range = bsearch(&key, bus->range, bus->dev_count,
5694                         sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
5695         if (range == NULL)
5696                 return -ENOENT;
5697
5698         off = range - bus->range;
5699
5700         while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
5701                 off--;
5702
5703         return off;
5704 }
5705
5706 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5707                               struct kvm_io_range *range, const void *val)
5708 {
5709         int idx;
5710
5711         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5712         if (idx < 0)
5713                 return -EOPNOTSUPP;
5714
5715         while (idx < bus->dev_count &&
5716                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5717                 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
5718                                         range->len, val))
5719                         return idx;
5720                 idx++;
5721         }
5722
5723         return -EOPNOTSUPP;
5724 }
5725
5726 /* kvm_io_bus_write - called under kvm->slots_lock */
5727 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5728                      int len, const void *val)
5729 {
5730         struct kvm_io_bus *bus;
5731         struct kvm_io_range range;
5732         int r;
5733
5734         range = (struct kvm_io_range) {
5735                 .addr = addr,
5736                 .len = len,
5737         };
5738
5739         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5740         if (!bus)
5741                 return -ENOMEM;
5742         r = __kvm_io_bus_write(vcpu, bus, &range, val);
5743         return r < 0 ? r : 0;
5744 }
5745 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5746
5747 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
5748 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
5749                             gpa_t addr, int len, const void *val, long cookie)
5750 {
5751         struct kvm_io_bus *bus;
5752         struct kvm_io_range range;
5753
5754         range = (struct kvm_io_range) {
5755                 .addr = addr,
5756                 .len = len,
5757         };
5758
5759         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5760         if (!bus)
5761                 return -ENOMEM;
5762
5763         /* First try the device referenced by cookie. */
5764         if ((cookie >= 0) && (cookie < bus->dev_count) &&
5765             (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
5766                 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
5767                                         val))
5768                         return cookie;
5769
5770         /*
5771          * cookie contained garbage; fall back to search and return the
5772          * correct cookie value.
5773          */
5774         return __kvm_io_bus_write(vcpu, bus, &range, val);
5775 }
5776
5777 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
5778                              struct kvm_io_range *range, void *val)
5779 {
5780         int idx;
5781
5782         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
5783         if (idx < 0)
5784                 return -EOPNOTSUPP;
5785
5786         while (idx < bus->dev_count &&
5787                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
5788                 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
5789                                        range->len, val))
5790                         return idx;
5791                 idx++;
5792         }
5793
5794         return -EOPNOTSUPP;
5795 }
5796
5797 /* kvm_io_bus_read - called under kvm->slots_lock */
5798 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5799                     int len, void *val)
5800 {
5801         struct kvm_io_bus *bus;
5802         struct kvm_io_range range;
5803         int r;
5804
5805         range = (struct kvm_io_range) {
5806                 .addr = addr,
5807                 .len = len,
5808         };
5809
5810         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5811         if (!bus)
5812                 return -ENOMEM;
5813         r = __kvm_io_bus_read(vcpu, bus, &range, val);
5814         return r < 0 ? r : 0;
5815 }
5816
5817 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5818                             int len, struct kvm_io_device *dev)
5819 {
5820         int i;
5821         struct kvm_io_bus *new_bus, *bus;
5822         struct kvm_io_range range;
5823
5824         lockdep_assert_held(&kvm->slots_lock);
5825
5826         bus = kvm_get_bus(kvm, bus_idx);
5827         if (!bus)
5828                 return -ENOMEM;
5829
5830         /* exclude ioeventfd which is limited by maximum fd */
5831         if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5832                 return -ENOSPC;
5833
5834         new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5835                           GFP_KERNEL_ACCOUNT);
5836         if (!new_bus)
5837                 return -ENOMEM;
5838
5839         range = (struct kvm_io_range) {
5840                 .addr = addr,
5841                 .len = len,
5842                 .dev = dev,
5843         };
5844
5845         for (i = 0; i < bus->dev_count; i++)
5846                 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5847                         break;
5848
5849         memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5850         new_bus->dev_count++;
5851         new_bus->range[i] = range;
5852         memcpy(new_bus->range + i + 1, bus->range + i,
5853                 (bus->dev_count - i) * sizeof(struct kvm_io_range));
5854         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5855         synchronize_srcu_expedited(&kvm->srcu);
5856         kfree(bus);
5857
5858         return 0;
5859 }
5860
5861 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5862                               struct kvm_io_device *dev)
5863 {
5864         int i;
5865         struct kvm_io_bus *new_bus, *bus;
5866
5867         lockdep_assert_held(&kvm->slots_lock);
5868
5869         bus = kvm_get_bus(kvm, bus_idx);
5870         if (!bus)
5871                 return 0;
5872
5873         for (i = 0; i < bus->dev_count; i++) {
5874                 if (bus->range[i].dev == dev) {
5875                         break;
5876                 }
5877         }
5878
5879         if (i == bus->dev_count)
5880                 return 0;
5881
5882         new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5883                           GFP_KERNEL_ACCOUNT);
5884         if (new_bus) {
5885                 memcpy(new_bus, bus, struct_size(bus, range, i));
5886                 new_bus->dev_count--;
5887                 memcpy(new_bus->range + i, bus->range + i + 1,
5888                                 flex_array_size(new_bus, range, new_bus->dev_count - i));
5889         }
5890
5891         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5892         synchronize_srcu_expedited(&kvm->srcu);
5893
5894         /*
5895          * If NULL bus is installed, destroy the old bus, including all the
5896          * attached devices. Otherwise, destroy the caller's device only.
5897          */
5898         if (!new_bus) {
5899                 pr_err("kvm: failed to shrink bus, removing it completely\n");
5900                 kvm_io_bus_destroy(bus);
5901                 return -ENOMEM;
5902         }
5903
5904         kvm_iodevice_destructor(dev);
5905         kfree(bus);
5906         return 0;
5907 }
5908
5909 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5910                                          gpa_t addr)
5911 {
5912         struct kvm_io_bus *bus;
5913         int dev_idx, srcu_idx;
5914         struct kvm_io_device *iodev = NULL;
5915
5916         srcu_idx = srcu_read_lock(&kvm->srcu);
5917
5918         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5919         if (!bus)
5920                 goto out_unlock;
5921
5922         dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5923         if (dev_idx < 0)
5924                 goto out_unlock;
5925
5926         iodev = bus->range[dev_idx].dev;
5927
5928 out_unlock:
5929         srcu_read_unlock(&kvm->srcu, srcu_idx);
5930
5931         return iodev;
5932 }
5933 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5934
5935 static int kvm_debugfs_open(struct inode *inode, struct file *file,
5936                            int (*get)(void *, u64 *), int (*set)(void *, u64),
5937                            const char *fmt)
5938 {
5939         int ret;
5940         struct kvm_stat_data *stat_data = inode->i_private;
5941
5942         /*
5943          * The debugfs files are a reference to the kvm struct which
5944         * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
5945         * avoids the race between open and the removal of the debugfs directory.
5946          */
5947         if (!kvm_get_kvm_safe(stat_data->kvm))
5948                 return -ENOENT;
5949
5950         ret = simple_attr_open(inode, file, get,
5951                                kvm_stats_debugfs_mode(stat_data->desc) & 0222
5952                                ? set : NULL, fmt);
5953         if (ret)
5954                 kvm_put_kvm(stat_data->kvm);
5955
5956         return ret;
5957 }
5958
5959 static int kvm_debugfs_release(struct inode *inode, struct file *file)
5960 {
5961         struct kvm_stat_data *stat_data = inode->i_private;
5962
5963         simple_attr_release(inode, file);
5964         kvm_put_kvm(stat_data->kvm);
5965
5966         return 0;
5967 }
5968
5969 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5970 {
5971         *val = *(u64 *)((void *)(&kvm->stat) + offset);
5972
5973         return 0;
5974 }
5975
5976 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5977 {
5978         *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5979
5980         return 0;
5981 }
5982
5983 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5984 {
5985         unsigned long i;
5986         struct kvm_vcpu *vcpu;
5987
5988         *val = 0;
5989
5990         kvm_for_each_vcpu(i, vcpu, kvm)
5991                 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5992
5993         return 0;
5994 }
5995
5996 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5997 {
5998         unsigned long i;
5999         struct kvm_vcpu *vcpu;
6000
6001         kvm_for_each_vcpu(i, vcpu, kvm)
6002                 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
6003
6004         return 0;
6005 }
6006
6007 static int kvm_stat_data_get(void *data, u64 *val)
6008 {
6009         int r = -EFAULT;
6010         struct kvm_stat_data *stat_data = data;
6011
6012         switch (stat_data->kind) {
6013         case KVM_STAT_VM:
6014                 r = kvm_get_stat_per_vm(stat_data->kvm,
6015                                         stat_data->desc->desc.offset, val);
6016                 break;
6017         case KVM_STAT_VCPU:
6018                 r = kvm_get_stat_per_vcpu(stat_data->kvm,
6019                                           stat_data->desc->desc.offset, val);
6020                 break;
6021         }
6022
6023         return r;
6024 }
6025
6026 static int kvm_stat_data_clear(void *data, u64 val)
6027 {
6028         int r = -EFAULT;
6029         struct kvm_stat_data *stat_data = data;
6030
6031         if (val)
6032                 return -EINVAL;
6033
6034         switch (stat_data->kind) {
6035         case KVM_STAT_VM:
6036                 r = kvm_clear_stat_per_vm(stat_data->kvm,
6037                                           stat_data->desc->desc.offset);
6038                 break;
6039         case KVM_STAT_VCPU:
6040                 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
6041                                             stat_data->desc->desc.offset);
6042                 break;
6043         }
6044
6045         return r;
6046 }
6047
6048 static int kvm_stat_data_open(struct inode *inode, struct file *file)
6049 {
6050         __simple_attr_check_format("%llu\n", 0ull);
6051         return kvm_debugfs_open(inode, file, kvm_stat_data_get,
6052                                 kvm_stat_data_clear, "%llu\n");
6053 }
6054
6055 static const struct file_operations stat_fops_per_vm = {
6056         .owner = THIS_MODULE,
6057         .open = kvm_stat_data_open,
6058         .release = kvm_debugfs_release,
6059         .read = simple_attr_read,
6060         .write = simple_attr_write,
6061 };
6062
6063 static int vm_stat_get(void *_offset, u64 *val)
6064 {
6065         unsigned offset = (long)_offset;
6066         struct kvm *kvm;
6067         u64 tmp_val;
6068
6069         *val = 0;
6070         mutex_lock(&kvm_lock);
6071         list_for_each_entry(kvm, &vm_list, vm_list) {
6072                 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
6073                 *val += tmp_val;
6074         }
6075         mutex_unlock(&kvm_lock);
6076         return 0;
6077 }
6078
6079 static int vm_stat_clear(void *_offset, u64 val)
6080 {
6081         unsigned offset = (long)_offset;
6082         struct kvm *kvm;
6083
6084         if (val)
6085                 return -EINVAL;
6086
6087         mutex_lock(&kvm_lock);
6088         list_for_each_entry(kvm, &vm_list, vm_list) {
6089                 kvm_clear_stat_per_vm(kvm, offset);
6090         }
6091         mutex_unlock(&kvm_lock);
6092
6093         return 0;
6094 }
6095
6096 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
6097 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
6098
6099 static int vcpu_stat_get(void *_offset, u64 *val)
6100 {
6101         unsigned offset = (long)_offset;
6102         struct kvm *kvm;
6103         u64 tmp_val;
6104
6105         *val = 0;
6106         mutex_lock(&kvm_lock);
6107         list_for_each_entry(kvm, &vm_list, vm_list) {
6108                 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
6109                 *val += tmp_val;
6110         }
6111         mutex_unlock(&kvm_lock);
6112         return 0;
6113 }
6114
6115 static int vcpu_stat_clear(void *_offset, u64 val)
6116 {
6117         unsigned offset = (long)_offset;
6118         struct kvm *kvm;
6119
6120         if (val)
6121                 return -EINVAL;
6122
6123         mutex_lock(&kvm_lock);
6124         list_for_each_entry(kvm, &vm_list, vm_list) {
6125                 kvm_clear_stat_per_vcpu(kvm, offset);
6126         }
6127         mutex_unlock(&kvm_lock);
6128
6129         return 0;
6130 }
6131
6132 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
6133                         "%llu\n");
6134 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
6135
6136 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
6137 {
6138         struct kobj_uevent_env *env;
6139         unsigned long long created, active;
6140
6141         if (!kvm_dev.this_device || !kvm)
6142                 return;
6143
6144         mutex_lock(&kvm_lock);
6145         if (type == KVM_EVENT_CREATE_VM) {
6146                 kvm_createvm_count++;
6147                 kvm_active_vms++;
6148         } else if (type == KVM_EVENT_DESTROY_VM) {
6149                 kvm_active_vms--;
6150         }
6151         created = kvm_createvm_count;
6152         active = kvm_active_vms;
6153         mutex_unlock(&kvm_lock);
6154
6155         env = kzalloc(sizeof(*env), GFP_KERNEL);
6156         if (!env)
6157                 return;
6158
6159         add_uevent_var(env, "CREATED=%llu", created);
6160         add_uevent_var(env, "COUNT=%llu", active);
6161
6162         if (type == KVM_EVENT_CREATE_VM) {
6163                 add_uevent_var(env, "EVENT=create");
6164                 kvm->userspace_pid = task_pid_nr(current);
6165         } else if (type == KVM_EVENT_DESTROY_VM) {
6166                 add_uevent_var(env, "EVENT=destroy");
6167         }
6168         add_uevent_var(env, "PID=%d", kvm->userspace_pid);
6169
6170         if (!IS_ERR(kvm->debugfs_dentry)) {
6171                 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
6172
6173                 if (p) {
6174                         tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
6175                         if (!IS_ERR(tmp))
6176                                 add_uevent_var(env, "STATS_PATH=%s", tmp);
6177                         kfree(p);
6178                 }
6179         }
6180         /* no need for checks, since we are adding at most only 5 keys */
6181         env->envp[env->envp_idx++] = NULL;
6182         kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
6183         kfree(env);
6184 }
6185
6186 static void kvm_init_debug(void)
6187 {
6188         const struct file_operations *fops;
6189         const struct _kvm_stats_desc *pdesc;
6190         int i;
6191
6192         kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
6193
6194         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
6195                 pdesc = &kvm_vm_stats_desc[i];
6196                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6197                         fops = &vm_stat_fops;
6198                 else
6199                         fops = &vm_stat_readonly_fops;
6200                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6201                                 kvm_debugfs_dir,
6202                                 (void *)(long)pdesc->desc.offset, fops);
6203         }
6204
6205         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
6206                 pdesc = &kvm_vcpu_stats_desc[i];
6207                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
6208                         fops = &vcpu_stat_fops;
6209                 else
6210                         fops = &vcpu_stat_readonly_fops;
6211                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
6212                                 kvm_debugfs_dir,
6213                                 (void *)(long)pdesc->desc.offset, fops);
6214         }
6215 }
6216
6217 static inline
6218 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
6219 {
6220         return container_of(pn, struct kvm_vcpu, preempt_notifier);
6221 }
6222
6223 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
6224 {
6225         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6226
6227         WRITE_ONCE(vcpu->preempted, false);
6228         WRITE_ONCE(vcpu->ready, false);
6229
6230         __this_cpu_write(kvm_running_vcpu, vcpu);
6231         kvm_arch_vcpu_load(vcpu, cpu);
6232
6233         WRITE_ONCE(vcpu->scheduled_out, false);
6234 }
6235
6236 static void kvm_sched_out(struct preempt_notifier *pn,
6237                           struct task_struct *next)
6238 {
6239         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
6240
6241         WRITE_ONCE(vcpu->scheduled_out, true);
6242
6243         if (task_is_runnable(current) && vcpu->wants_to_run) {
6244                 WRITE_ONCE(vcpu->preempted, true);
6245                 WRITE_ONCE(vcpu->ready, true);
6246         }
6247         kvm_arch_vcpu_put(vcpu);
6248         __this_cpu_write(kvm_running_vcpu, NULL);
6249 }
6250
6251 /**
6252  * kvm_get_running_vcpu - get the vcpu running on the current CPU.
6253  *
6254  * We can disable preemption locally around accessing the per-CPU variable,
6255  * and use the resolved vcpu pointer after enabling preemption again,
6256  * because even if the current thread is migrated to another CPU, reading
6257  * the per-CPU value later will give us the same value as we update the
6258  * per-CPU variable in the preempt notifier handlers.
6259  */
6260 struct kvm_vcpu *kvm_get_running_vcpu(void)
6261 {
6262         struct kvm_vcpu *vcpu;
6263
6264         preempt_disable();
6265         vcpu = __this_cpu_read(kvm_running_vcpu);
6266         preempt_enable();
6267
6268         return vcpu;
6269 }
6270 EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
6271
6272 /**
6273  * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
6274  */
6275 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
6276 {
6277         return &kvm_running_vcpu;
6278 }
6279
6280 #ifdef CONFIG_GUEST_PERF_EVENTS
6281 static unsigned int kvm_guest_state(void)
6282 {
6283         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6284         unsigned int state;
6285
6286         if (!kvm_arch_pmi_in_guest(vcpu))
6287                 return 0;
6288
6289         state = PERF_GUEST_ACTIVE;
6290         if (!kvm_arch_vcpu_in_kernel(vcpu))
6291                 state |= PERF_GUEST_USER;
6292
6293         return state;
6294 }
6295
6296 static unsigned long kvm_guest_get_ip(void)
6297 {
6298         struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6299
6300         /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
6301         if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
6302                 return 0;
6303
6304         return kvm_arch_vcpu_get_ip(vcpu);
6305 }
6306
6307 static struct perf_guest_info_callbacks kvm_guest_cbs = {
6308         .state                  = kvm_guest_state,
6309         .get_ip                 = kvm_guest_get_ip,
6310         .handle_intel_pt_intr   = NULL,
6311 };
6312
6313 void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
6314 {
6315         kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6316         perf_register_guest_info_callbacks(&kvm_guest_cbs);
6317 }
6318 void kvm_unregister_perf_callbacks(void)
6319 {
6320         perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
6321 }
6322 #endif
6323
6324 int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6325 {
6326         int r;
6327         int cpu;
6328
6329         /* A kmem cache lets us meet the alignment requirements of fx_save. */
6330         if (!vcpu_align)
6331                 vcpu_align = __alignof__(struct kvm_vcpu);
6332         kvm_vcpu_cache =
6333                 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
6334                                            SLAB_ACCOUNT,
6335                                            offsetof(struct kvm_vcpu, arch),
6336                                            offsetofend(struct kvm_vcpu, stats_id)
6337                                            - offsetof(struct kvm_vcpu, arch),
6338                                            NULL);
6339         if (!kvm_vcpu_cache)
6340                 return -ENOMEM;
6341
6342         for_each_possible_cpu(cpu) {
6343                 if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
6344                                             GFP_KERNEL, cpu_to_node(cpu))) {
6345                         r = -ENOMEM;
6346                         goto err_cpu_kick_mask;
6347                 }
6348         }
6349
6350         r = kvm_irqfd_init();
6351         if (r)
6352                 goto err_irqfd;
6353
6354         r = kvm_async_pf_init();
6355         if (r)
6356                 goto err_async_pf;
6357
6358         kvm_chardev_ops.owner = module;
6359         kvm_vm_fops.owner = module;
6360         kvm_vcpu_fops.owner = module;
6361         kvm_device_fops.owner = module;
6362
6363         kvm_preempt_ops.sched_in = kvm_sched_in;
6364         kvm_preempt_ops.sched_out = kvm_sched_out;
6365
6366         kvm_init_debug();
6367
6368         r = kvm_vfio_ops_init();
6369         if (WARN_ON_ONCE(r))
6370                 goto err_vfio;
6371
6372         kvm_gmem_init(module);
6373
6374         r = kvm_init_virtualization();
6375         if (r)
6376                 goto err_virt;
6377
6378         /*
6379          * Registration _must_ be the very last thing done, as this exposes
6380          * /dev/kvm to userspace, i.e. all infrastructure must be setup!
6381          */
6382         r = misc_register(&kvm_dev);
6383         if (r) {
6384                 pr_err("kvm: misc device register failed\n");
6385                 goto err_register;
6386         }
6387
6388         return 0;
6389
6390 err_register:
6391         kvm_uninit_virtualization();
6392 err_virt:
6393         kvm_vfio_ops_exit();
6394 err_vfio:
6395         kvm_async_pf_deinit();
6396 err_async_pf:
6397         kvm_irqfd_exit();
6398 err_irqfd:
6399 err_cpu_kick_mask:
6400         for_each_possible_cpu(cpu)
6401                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6402         kmem_cache_destroy(kvm_vcpu_cache);
6403         return r;
6404 }
6405 EXPORT_SYMBOL_GPL(kvm_init);
6406
6407 void kvm_exit(void)
6408 {
6409         int cpu;
6410
6411         /*
6412          * Note, unregistering /dev/kvm doesn't strictly need to come first,
6413          * fops_get(), a.k.a. try_module_get(), prevents acquiring references
6414          * to KVM while the module is being stopped.
6415          */
6416         misc_deregister(&kvm_dev);
6417
6418         kvm_uninit_virtualization();
6419
6420         debugfs_remove_recursive(kvm_debugfs_dir);
6421         for_each_possible_cpu(cpu)
6422                 free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6423         kmem_cache_destroy(kvm_vcpu_cache);
6424         kvm_vfio_ops_exit();
6425         kvm_async_pf_deinit();
6426         kvm_irqfd_exit();
6427 }
6428 EXPORT_SYMBOL_GPL(kvm_exit);