drivers/vfio/vfio_iommu_type1.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
   4  *
   5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6  *     Author: Alex Williamson <[email protected]>
   7  *
   8  * Derived from original vfio:
   9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10  * Author: Tom Lyon, [email protected]
  11  *
  12  * We arbitrarily define a Type1 IOMMU as one matching the below code.
  13  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
  14  * VT-d, but that makes it harder to re-use as theoretically anyone
  15  * implementing a similar IOMMU could make use of this.  We expect the
  16  * IOMMU to support the IOMMU API and have few to no restrictions around
  17  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
  18  * optimized for relatively static mappings of a userspace process with
  19  * userspace pages pinned into memory.  We also assume devices and IOMMU
  20  * domains are PCI based as the IOMMU API is still centered around a
  21  * device/bus interface rather than a group interface.
  22  */
  23
  24 #include <linux/compat.h>
  25 #include <linux/device.h>
  26 #include <linux/fs.h>
  27 #include <linux/highmem.h>
  28 #include <linux/iommu.h>
  29 #include <linux/module.h>
  30 #include <linux/mm.h>
  31 #include <linux/kthread.h>
  32 #include <linux/rbtree.h>
  33 #include <linux/sched/signal.h>
  34 #include <linux/sched/mm.h>
  35 #include <linux/slab.h>
  36 #include <linux/uaccess.h>
  37 #include <linux/vfio.h>
  38 #include <linux/workqueue.h>
  39 #include <linux/notifier.h>
  40 #include "vfio.h"
  41
/*
 * Module identification strings.
 * NOTE(review): presumably consumed by MODULE_VERSION/MODULE_AUTHOR/
 * MODULE_DESCRIPTION declarations outside this chunk - confirm.
 */
#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <[email protected]>"
#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"

/*
 * Module parameter "allow_unsafe_interrupts", registered with
 * S_IRUGO | S_IWUSR permissions.  The code that reads it is not part of
 * this chunk; the parameter description below is the only visible
 * statement of its purpose.
 */
static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
                   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
                 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");

/*
 * Module parameter "disable_hugepages".  When set, vfio_batch_init() skips
 * the page-sized batch array and uses the single-entry fallback, and
 * vfio_pin_pages_remote() stops after processing one batch.
 */
static bool disable_hugepages;
module_param_named(disable_hugepages,
                   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
                 "Disable VFIO IOMMU support for IOMMU hugepages.");

/*
 * Module parameter "dma_entry_limit", default U16_MAX, mode 0644.
 * NOTE(review): presumably the initial value of vfio_iommu.dma_avail for
 * each container - the consumer is outside this chunk, confirm.
 */
static unsigned int dma_entry_limit __read_mostly = U16_MAX;
module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
MODULE_PARM_DESC(dma_entry_limit,
                 "Maximum number of user DMA mappings per container (65535).");
  62
/*
 * State of one Type1 IOMMU backend instance (the "iommu_data" handed to
 * the pin/unpin callbacks).
 *
 * Fields whose use is visible in this part of the file:
 *   domain_list         - when empty, the external pin/unpin paths perform
 *                         locked-memory accounting themselves
 *   lock                - held across vfio_iommu_type1_pin_pages() and
 *                         vfio_iommu_type1_unpin_pages()
 *   dma_list            - rb-tree of struct vfio_dma ordered by IOVA range
 *   device_list         - pinning fails with -EINVAL while this is empty
 *   vaddr_invalid_count - pinning fails with -EBUSY while this is non-zero
 *   pgsize_bitmap       - its lowest set bit gives the page shift used for
 *                         the dirty bitmaps
 *   num_non_pinned_groups - decremented the first time a group gains
 *                         pinned_page_dirty_scope
 *   v2                  - pin/unpin are only supported when set
 *   dirty_page_tracking - when set, pinning marks the page in dma->bitmap
 *
 * NOTE(review): iova_list, device_list_lock, dma_avail and
 * emulated_iommu_groups are not referenced in this chunk; their meaning
 * should be confirmed against the rest of the file.
 */
struct vfio_iommu {
        struct list_head        domain_list;
        struct list_head        iova_list;
        struct mutex            lock;
        struct rb_root          dma_list;
        struct list_head        device_list;
        struct mutex            device_list_lock;
        unsigned int            dma_avail;
        unsigned int            vaddr_invalid_count;
        uint64_t                pgsize_bitmap;
        uint64_t                num_non_pinned_groups;
        bool                    v2;
        bool                    dirty_page_tracking;
        struct list_head        emulated_iommu_groups;
};
  78
/*
 * Wrapper around one struct iommu_domain.
 * NOTE(review): "next" presumably links into vfio_iommu.domain_list and
 * "group_list" presumably holds struct vfio_iommu_group entries - neither
 * is exercised in this chunk, confirm.
 */
struct vfio_domain {
        struct iommu_domain     *domain;
        struct list_head        next;
        struct list_head        group_list;
        bool                    fgsp : 1;       /* Fine-grained super pages */
        bool                    enforce_cache_coherency : 1;
};
  86
/*
 * One user DMA mapping: the IOVA range [iova, iova + size) backed by the
 * process virtual range starting at vaddr.  Entries live in
 * vfio_iommu.dma_list, ordered by IOVA range.
 */
struct vfio_dma {
        struct rb_node          node;           /* Link in vfio_iommu.dma_list */
        dma_addr_t              iova;           /* Device address */
        unsigned long           vaddr;          /* Process virtual addr */
        size_t                  size;           /* Map size (bytes) */
        int                     prot;           /* IOMMU_READ/WRITE */
        bool                    iommu_mapped;   /* Not used in this chunk */
        bool                    lock_cap;       /* capable(CAP_IPC_LOCK) */
        bool                    vaddr_invalid;  /* Not used in this chunk */
        struct task_struct      *task;          /* Task passed to locked_vm accounting */
        struct rb_root          pfn_list;       /* Ex-user pinned pfn list */
        unsigned long           *bitmap;        /* Dirty bitmap, NULL when not allocated */
        struct mm_struct        *mm;            /* mm charged by vfio_lock_acct() */
        size_t                  locked_vm;      /* Net pages accounted via vfio_lock_acct() */
};
 102
/*
 * Scratch array of pinned pages that is consumed incrementally by
 * vfio_pin_pages_remote().  "pages" either points at a page-sized array
 * (capacity VFIO_BATCH_MAX_CAPACITY) or at "fallback_page" (capacity 1);
 * see vfio_batch_init().  The not-yet-consumed entries are
 * pages[offset] .. pages[offset + size - 1].
 */
struct vfio_batch {
        struct page             **pages;        /* for pin_user_pages_remote */
        struct page             *fallback_page; /* if pages alloc fails */
        int                     capacity;       /* length of pages array */
        int                     size;           /* of batch currently */
        int                     offset;         /* of next entry in pages */
};
 110
/*
 * Book-keeping for one iommu_group attached to the container.
 * pinned_page_dirty_scope is set by vfio_iommu_type1_pin_pages() after a
 * successful pin request made on behalf of the group.
 * NOTE(review): which list "next" links into is not visible here.
 */
struct vfio_iommu_group {
        struct iommu_group      *iommu_group;
        struct list_head        next;
        bool                    pinned_page_dirty_scope;
};
 116
/*
 * A list node describing an IOVA range by its start and end addresses.
 * NOTE(review): presumably an element of vfio_iommu.iova_list, and whether
 * "end" is inclusive cannot be told from this chunk - confirm.
 */
struct vfio_iova {
        struct list_head        list;
        dma_addr_t              start;
        dma_addr_t              end;
};
 122
/*
 * Guest RAM pinning working set or DMA target
 *
 * One externally pinned page of a vfio_dma.  Entries are kept in
 * vfio_dma.pfn_list keyed by iova and are reference counted: created with
 * ref_count 1 by vfio_add_to_pfn_list(), and unpinned and freed when
 * vfio_iova_put_vfio_pfn() drops the count to zero.
 */
struct vfio_pfn {
        struct rb_node          node;
        dma_addr_t              iova;           /* Device address */
        unsigned long           pfn;            /* Host pfn */
        unsigned int            ref_count;
};
 132
/*
 * A list node pairing an IOVA with a physical address and a length.
 * NOTE(review): not referenced in this chunk; presumably used to queue
 * ranges for deferred unmap/unpin - confirm against the rest of the file.
 */
struct vfio_regions {
        struct list_head list;
        dma_addr_t iova;
        phys_addr_t phys;
        size_t len;
};
 139
/* Bytes needed for a bitmap of n bits, with n rounded up to a multiple of 64. */
#define DIRTY_BITMAP_BYTES(n)   (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

/*
 * bitmap_set() takes the number of bits as an unsigned int, which
 * __bitmap_set() then converts to a signed int for unaligned multi-bit
 * operations.  The largest supported bitmap is therefore 2^31 bits, i.e.
 * 2^31 / 2^3 = 2^28 bytes (256 MB), which covers 2^31 * 2^12 = 2^43 bytes
 * (8 TB) of mapped memory on a system with 4K pages.
 */
#define DIRTY_BITMAP_PAGES_MAX   ((u64)INT_MAX)
#define DIRTY_BITMAP_SIZE_MAX    DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 152
/*
 * Forward declarations: put_pfn() is called by vfio_iova_put_vfio_pfn()
 * before its definition, and vfio_iommu_find_iommu_group() is called by
 * vfio_iommu_type1_pin_pages() before its definition.
 */
static int put_pfn(unsigned long pfn, int prot);

static struct vfio_iommu_group*
vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
                            struct iommu_group *iommu_group);
 158
 159 /*
 160  * This code handles mapping and unmapping of user data buffers
 161  * into DMA'ble space using the IOMMU
 162  */
 163
 164 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
 165                                       dma_addr_t start, size_t size)
 166 {
 167         struct rb_node *node = iommu->dma_list.rb_node;
 168
 169         while (node) {
 170                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 171
 172                 if (start + size <= dma->iova)
 173                         node = node->rb_left;
 174                 else if (start >= dma->iova + dma->size)
 175                         node = node->rb_right;
 176                 else
 177                         return dma;
 178         }
 179
 180         return NULL;
 181 }
 182
 183 static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
 184                                                 dma_addr_t start, u64 size)
 185 {
 186         struct rb_node *res = NULL;
 187         struct rb_node *node = iommu->dma_list.rb_node;
 188         struct vfio_dma *dma_res = NULL;
 189
 190         while (node) {
 191                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 192
 193                 if (start < dma->iova + dma->size) {
 194                         res = node;
 195                         dma_res = dma;
 196                         if (start >= dma->iova)
 197                                 break;
 198                         node = node->rb_left;
 199                 } else {
 200                         node = node->rb_right;
 201                 }
 202         }
 203         if (res && size && dma_res->iova >= start + size)
 204                 res = NULL;
 205         return res;
 206 }
 207
 208 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 209 {
 210         struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 211         struct vfio_dma *dma;
 212
 213         while (*link) {
 214                 parent = *link;
 215                 dma = rb_entry(parent, struct vfio_dma, node);
 216
 217                 if (new->iova + new->size <= dma->iova)
 218                         link = &(*link)->rb_left;
 219                 else
 220                         link = &(*link)->rb_right;
 221         }
 222
 223         rb_link_node(&new->node, parent, link);
 224         rb_insert_color(&new->node, &iommu->dma_list);
 225 }
 226
 227 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 228 {
 229         rb_erase(&old->node, &iommu->dma_list);
 230 }
 231
 232
 233 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
 234 {
 235         uint64_t npages = dma->size / pgsize;
 236
 237         if (npages > DIRTY_BITMAP_PAGES_MAX)
 238                 return -EINVAL;
 239
 240         /*
 241          * Allocate extra 64 bits that are used to calculate shift required for
 242          * bitmap_shift_left() to manipulate and club unaligned number of pages
 243          * in adjacent vfio_dma ranges.
 244          */
 245         dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
 246                                GFP_KERNEL);
 247         if (!dma->bitmap)
 248                 return -ENOMEM;
 249
 250         return 0;
 251 }
 252
 253 static void vfio_dma_bitmap_free(struct vfio_dma *dma)
 254 {
 255         kvfree(dma->bitmap);
 256         dma->bitmap = NULL;
 257 }
 258
 259 static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
 260 {
 261         struct rb_node *p;
 262         unsigned long pgshift = __ffs(pgsize);
 263
 264         for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
 265                 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
 266
 267                 bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
 268         }
 269 }
 270
 271 static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
 272 {
 273         struct rb_node *n;
 274         unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
 275
 276         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 277                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 278
 279                 bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
 280         }
 281 }
 282
 283 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
 284 {
 285         struct rb_node *n;
 286
 287         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 288                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 289                 int ret;
 290
 291                 ret = vfio_dma_bitmap_alloc(dma, pgsize);
 292                 if (ret) {
 293                         struct rb_node *p;
 294
 295                         for (p = rb_prev(n); p; p = rb_prev(p)) {
 296                                 struct vfio_dma *dma = rb_entry(n,
 297                                                         struct vfio_dma, node);
 298
 299                                 vfio_dma_bitmap_free(dma);
 300                         }
 301                         return ret;
 302                 }
 303                 vfio_dma_populate_bitmap(dma, pgsize);
 304         }
 305         return 0;
 306 }
 307
 308 static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
 309 {
 310         struct rb_node *n;
 311
 312         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 313                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 314
 315                 vfio_dma_bitmap_free(dma);
 316         }
 317 }
 318
 319 /*
 320  * Helper Functions for host iova-pfn list
 321  */
 322 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
 323 {
 324         struct vfio_pfn *vpfn;
 325         struct rb_node *node = dma->pfn_list.rb_node;
 326
 327         while (node) {
 328                 vpfn = rb_entry(node, struct vfio_pfn, node);
 329
 330                 if (iova < vpfn->iova)
 331                         node = node->rb_left;
 332                 else if (iova > vpfn->iova)
 333                         node = node->rb_right;
 334                 else
 335                         return vpfn;
 336         }
 337         return NULL;
 338 }
 339
 340 static void vfio_link_pfn(struct vfio_dma *dma,
 341                           struct vfio_pfn *new)
 342 {
 343         struct rb_node **link, *parent = NULL;
 344         struct vfio_pfn *vpfn;
 345
 346         link = &dma->pfn_list.rb_node;
 347         while (*link) {
 348                 parent = *link;
 349                 vpfn = rb_entry(parent, struct vfio_pfn, node);
 350
 351                 if (new->iova < vpfn->iova)
 352                         link = &(*link)->rb_left;
 353                 else
 354                         link = &(*link)->rb_right;
 355         }
 356
 357         rb_link_node(&new->node, parent, link);
 358         rb_insert_color(&new->node, &dma->pfn_list);
 359 }
 360
 361 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
 362 {
 363         rb_erase(&old->node, &dma->pfn_list);
 364 }
 365
 366 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
 367                                 unsigned long pfn)
 368 {
 369         struct vfio_pfn *vpfn;
 370
 371         vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
 372         if (!vpfn)
 373                 return -ENOMEM;
 374
 375         vpfn->iova = iova;
 376         vpfn->pfn = pfn;
 377         vpfn->ref_count = 1;
 378         vfio_link_pfn(dma, vpfn);
 379         return 0;
 380 }
 381
 382 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
 383                                       struct vfio_pfn *vpfn)
 384 {
 385         vfio_unlink_pfn(dma, vpfn);
 386         kfree(vpfn);
 387 }
 388
 389 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
 390                                                unsigned long iova)
 391 {
 392         struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 393
 394         if (vpfn)
 395                 vpfn->ref_count++;
 396         return vpfn;
 397 }
 398
 399 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
 400 {
 401         int ret = 0;
 402
 403         vpfn->ref_count--;
 404         if (!vpfn->ref_count) {
 405                 ret = put_pfn(vpfn->pfn, dma->prot);
 406                 vfio_remove_from_pfn_list(dma, vpfn);
 407         }
 408         return ret;
 409 }
 410
 411 static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
 412                         bool lock_cap, long npage)
 413 {
 414         int ret = mmap_write_lock_killable(mm);
 415
 416         if (ret)
 417                 return ret;
 418
 419         ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
 420         mmap_write_unlock(mm);
 421         return ret;
 422 }
 423
 424 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 425 {
 426         struct mm_struct *mm;
 427         int ret;
 428
 429         if (!npage)
 430                 return 0;
 431
 432         mm = dma->mm;
 433         if (async && !mmget_not_zero(mm))
 434                 return -ESRCH; /* process exited */
 435
 436         ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
 437         if (!ret)
 438                 dma->locked_vm += npage;
 439
 440         if (async)
 441                 mmput(mm);
 442
 443         return ret;
 444 }
 445
 446 /*
 447  * Some mappings aren't backed by a struct page, for example an mmap'd
 448  * MMIO range for our own or another device.  These use a different
 449  * pfn conversion and shouldn't be tracked as locked pages.
 450  * For compound pages, any driver that sets the reserved bit in head
 451  * page needs to set the reserved bit in all subpages to be safe.
 452  */
 453 static bool is_invalid_reserved_pfn(unsigned long pfn)
 454 {
 455         if (pfn_valid(pfn))
 456                 return PageReserved(pfn_to_page(pfn));
 457
 458         return true;
 459 }
 460
 461 static int put_pfn(unsigned long pfn, int prot)
 462 {
 463         if (!is_invalid_reserved_pfn(pfn)) {
 464                 struct page *page = pfn_to_page(pfn);
 465
 466                 unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
 467                 return 1;
 468         }
 469         return 0;
 470 }
 471
/* Number of struct page pointers that fit in the one-page batch array. */
#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
 473
 474 static void vfio_batch_init(struct vfio_batch *batch)
 475 {
 476         batch->size = 0;
 477         batch->offset = 0;
 478
 479         if (unlikely(disable_hugepages))
 480                 goto fallback;
 481
 482         batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
 483         if (!batch->pages)
 484                 goto fallback;
 485
 486         batch->capacity = VFIO_BATCH_MAX_CAPACITY;
 487         return;
 488
 489 fallback:
 490         batch->pages = &batch->fallback_page;
 491         batch->capacity = 1;
 492 }
 493
 494 static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
 495 {
 496         while (batch->size) {
 497                 unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
 498
 499                 put_pfn(pfn, dma->prot);
 500                 batch->offset++;
 501                 batch->size--;
 502         }
 503 }
 504
 505 static void vfio_batch_fini(struct vfio_batch *batch)
 506 {
 507         if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
 508                 free_page((unsigned long)batch->pages);
 509 }
 510
 511 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 512                             unsigned long vaddr, unsigned long *pfn,
 513                             bool write_fault)
 514 {
 515         struct follow_pfnmap_args args = { .vma = vma, .address = vaddr };
 516         int ret;
 517
 518         ret = follow_pfnmap_start(&args);
 519         if (ret) {
 520                 bool unlocked = false;
 521
 522                 ret = fixup_user_fault(mm, vaddr,
 523                                        FAULT_FLAG_REMOTE |
 524                                        (write_fault ?  FAULT_FLAG_WRITE : 0),
 525                                        &unlocked);
 526                 if (unlocked)
 527                         return -EAGAIN;
 528
 529                 if (ret)
 530                         return ret;
 531
 532                 ret = follow_pfnmap_start(&args);
 533                 if (ret)
 534                         return ret;
 535         }
 536
 537         if (write_fault && !args.writable)
 538                 ret = -EFAULT;
 539         else
 540                 *pfn = args.pfn;
 541
 542         follow_pfnmap_end(&args);
 543         return ret;
 544 }
 545
 546 /*
 547  * Returns the positive number of pfns successfully obtained or a negative
 548  * error code.
 549  */
 550 static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
 551                           long npages, int prot, unsigned long *pfn,
 552                           struct page **pages)
 553 {
 554         struct vm_area_struct *vma;
 555         unsigned int flags = 0;
 556         int ret;
 557
 558         if (prot & IOMMU_WRITE)
 559                 flags |= FOLL_WRITE;
 560
 561         mmap_read_lock(mm);
 562         ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
 563                                     pages, NULL);
 564         if (ret > 0) {
 565                 *pfn = page_to_pfn(pages[0]);
 566                 goto done;
 567         }
 568
 569         vaddr = untagged_addr_remote(mm, vaddr);
 570
 571 retry:
 572         vma = vma_lookup(mm, vaddr);
 573
 574         if (vma && vma->vm_flags & VM_PFNMAP) {
 575                 ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
 576                 if (ret == -EAGAIN)
 577                         goto retry;
 578
 579                 if (!ret) {
 580                         if (is_invalid_reserved_pfn(*pfn))
 581                                 ret = 1;
 582                         else
 583                                 ret = -EFAULT;
 584                 }
 585         }
 586 done:
 587         mmap_read_unlock(mm);
 588         return ret;
 589 }
 590
/*
 * Attempt to pin pages.  We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 *
 * @dma:      mapping being populated; supplies prot, lock_cap and the
 *            vaddr/iova bases used to derive the IOVA of @vaddr
 * @vaddr:    user virtual address at which to start
 * @npage:    maximum number of pages to pin
 * @pfn_base: output, first pfn of the physically contiguous run
 * @limit:    locked-memory limit in pages, checked against
 *            mm->locked_vm plus the pages accounted so far in this call
 * @batch:    page batch; may still hold pages left over from an earlier
 *            call, which are consumed before new pages are pinned
 *
 * Returns the number of pages in the contiguous run, or a negative errno.
 * On error every page taken in this call, and anything left in @batch, is
 * released again.
 */
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
                                  long npage, unsigned long *pfn_base,
                                  unsigned long limit, struct vfio_batch *batch)
{
        unsigned long pfn;
        struct mm_struct *mm = current->mm;
        long ret, pinned = 0, lock_acct = 0;
        bool rsvd;
        dma_addr_t iova = vaddr - dma->vaddr + dma->iova;

        /* This code path is only user initiated */
        if (!mm)
                return -ENODEV;

        if (batch->size) {
                /* Leftover pages in batch from an earlier call. */
                *pfn_base = page_to_pfn(batch->pages[batch->offset]);
                pfn = *pfn_base;
                rsvd = is_invalid_reserved_pfn(*pfn_base);
        } else {
                /* Zero marks "not yet known"; set on the first refill. */
                *pfn_base = 0;
        }

        while (npage) {
                if (!batch->size) {
                        /* Empty batch, so refill it. */
                        long req_pages = min_t(long, npage, batch->capacity);

                        ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
                                             &pfn, batch->pages);
                        if (ret < 0)
                                goto unpin_out;

                        batch->size = ret;
                        batch->offset = 0;

                        if (!*pfn_base) {
                                *pfn_base = pfn;
                                rsvd = is_invalid_reserved_pfn(*pfn_base);
                        }
                }

                /*
                 * pfn is preset for the first iteration of this inner loop and
                 * updated at the end to handle a VM_PFNMAP pfn.  In that case,
                 * batch->pages isn't valid (there's no struct page), so allow
                 * batch->pages to be touched only when there's more than one
                 * pfn to check, which guarantees the pfns are from a
                 * !VM_PFNMAP vma.
                 */
                while (true) {
                        /*
                         * Stop at the first pfn that is not physically
                         * contiguous with the run or whose reserved status
                         * differs; it is not consumed from the batch.
                         */
                        if (pfn != *pfn_base + pinned ||
                            rsvd != is_invalid_reserved_pfn(pfn))
                                goto out;

                        /*
                         * Reserved pages aren't counted against the user,
                         * externally pinned pages are already counted against
                         * the user.
                         */
                        if (!rsvd && !vfio_find_vpfn(dma, iova)) {
                                if (!dma->lock_cap &&
                                    mm->locked_vm + lock_acct + 1 > limit) {
                                        pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
                                                __func__, limit << PAGE_SHIFT);
                                        ret = -ENOMEM;
                                        goto unpin_out;
                                }
                                lock_acct++;
                        }

                        /* Consume this page from the batch. */
                        pinned++;
                        npage--;
                        vaddr += PAGE_SIZE;
                        iova += PAGE_SIZE;
                        batch->offset++;
                        batch->size--;

                        if (!batch->size)
                                break;

                        pfn = page_to_pfn(batch->pages[batch->offset]);
                }

                /* With hugepages disabled, process a single batch only. */
                if (unlikely(disable_hugepages))
                        break;
        }

out:
        /* Charge the pages counted above in one accounting call. */
        ret = vfio_lock_acct(dma, lock_acct, false);

unpin_out:
        if (batch->size == 1 && !batch->offset) {
                /* May be a VM_PFNMAP pfn, which the batch can't remember. */
                put_pfn(pfn, dma->prot);
                batch->size = 0;
        }

        if (ret < 0) {
                /* Undo the run consumed so far, then drain the batch. */
                if (pinned && !rsvd) {
                        for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
                                put_pfn(pfn, dma->prot);
                }
                vfio_batch_unpin(batch, dma);

                return ret;
        }

        return pinned;
}
 706
 707 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 708                                     unsigned long pfn, long npage,
 709                                     bool do_accounting)
 710 {
 711         long unlocked = 0, locked = 0;
 712         long i;
 713
 714         for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
 715                 if (put_pfn(pfn++, dma->prot)) {
 716                         unlocked++;
 717                         if (vfio_find_vpfn(dma, iova))
 718                                 locked++;
 719                 }
 720         }
 721
 722         if (do_accounting)
 723                 vfio_lock_acct(dma, locked - unlocked, true);
 724
 725         return unlocked;
 726 }
 727
 728 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 729                                   unsigned long *pfn_base, bool do_accounting)
 730 {
 731         struct page *pages[1];
 732         struct mm_struct *mm;
 733         int ret;
 734
 735         mm = dma->mm;
 736         if (!mmget_not_zero(mm))
 737                 return -ENODEV;
 738
 739         ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
 740         if (ret != 1)
 741                 goto out;
 742
 743         ret = 0;
 744
 745         if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
 746                 ret = vfio_lock_acct(dma, 1, false);
 747                 if (ret) {
 748                         put_pfn(*pfn_base, dma->prot);
 749                         if (ret == -ENOMEM)
 750                                 pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
 751                                         "(%ld) exceeded\n", __func__,
 752                                         dma->task->comm, task_pid_nr(dma->task),
 753                                         task_rlimit(dma->task, RLIMIT_MEMLOCK));
 754                 }
 755         }
 756
 757 out:
 758         mmput(mm);
 759         return ret;
 760 }
 761
 762 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 763                                     bool do_accounting)
 764 {
 765         int unlocked;
 766         struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 767
 768         if (!vpfn)
 769                 return 0;
 770
 771         unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
 772
 773         if (do_accounting)
 774                 vfio_lock_acct(dma, -unlocked, true);
 775
 776         return unlocked;
 777 }
 778
/*
 * Pin @npage pages starting at @user_iova on behalf of an external user
 * and return their struct page pointers in @pages.
 *
 * @iommu_data:  the struct vfio_iommu of the container
 * @iommu_group: group making the request; marked pinned_page_dirty_scope
 *               on success
 * @user_iova:   first IOVA to pin; page i is at user_iova + i * PAGE_SIZE
 * @npage:       number of pages
 * @prot:        required protection; each page's mapping must grant all
 *               of these bits
 * @pages:       output array of at least @npage entries
 *
 * Returns the number of pages pinned (@npage) on success.  On failure all
 * pages pinned by this call are released, their @pages slots are cleared
 * and a negative errno is returned: -EINVAL (bad arguments, no registered
 * device, unmapped IOVA or pfn without struct page), -EACCES (not a v2
 * container), -EBUSY (vaddr currently invalidated), -EPERM (protection
 * mismatch), or an error from pinning/accounting/allocation.
 */
static int vfio_iommu_type1_pin_pages(void *iommu_data,
                                      struct iommu_group *iommu_group,
                                      dma_addr_t user_iova,
                                      int npage, int prot,
                                      struct page **pages)
{
        struct vfio_iommu *iommu = iommu_data;
        struct vfio_iommu_group *group;
        int i, j, ret;
        unsigned long remote_vaddr;
        struct vfio_dma *dma;
        bool do_accounting;

        if (!iommu || !pages)
                return -EINVAL;

        /* Supported for v2 version only */
        if (!iommu->v2)
                return -EACCES;

        mutex_lock(&iommu->lock);

        if (WARN_ONCE(iommu->vaddr_invalid_count,
                      "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
                ret = -EBUSY;
                goto pin_done;
        }

        /* Fail if no dma_umap notifier is registered */
        if (list_empty(&iommu->device_list)) {
                ret = -EINVAL;
                goto pin_done;
        }

        /*
         * If iommu capable domain exist in the container then all pages are
         * already pinned and accounted. Accounting should be done if there is no
         * iommu capable domain in the container.
         */
        do_accounting = list_empty(&iommu->domain_list);

        for (i = 0; i < npage; i++) {
                unsigned long phys_pfn;
                dma_addr_t iova;
                struct vfio_pfn *vpfn;

                iova = user_iova + PAGE_SIZE * i;
                dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
                if (!dma) {
                        ret = -EINVAL;
                        goto pin_unwind;
                }

                if ((dma->prot & prot) != prot) {
                        ret = -EPERM;
                        goto pin_unwind;
                }

                /* Already pinned externally: just take another reference. */
                vpfn = vfio_iova_get_vfio_pfn(dma, iova);
                if (vpfn) {
                        pages[i] = pfn_to_page(vpfn->pfn);
                        continue;
                }

                remote_vaddr = dma->vaddr + (iova - dma->iova);
                ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn,
                                             do_accounting);
                if (ret)
                        goto pin_unwind;

                /* A struct page is required to fill in pages[i]. */
                if (!pfn_valid(phys_pfn)) {
                        ret = -EINVAL;
                        goto pin_unwind;
                }

                ret = vfio_add_to_pfn_list(dma, iova, phys_pfn);
                if (ret) {
                        /* Page i is not tracked yet, so release it here. */
                        if (put_pfn(phys_pfn, dma->prot) && do_accounting)
                                vfio_lock_acct(dma, -1, true);
                        goto pin_unwind;
                }

                pages[i] = pfn_to_page(phys_pfn);

                if (iommu->dirty_page_tracking) {
                        unsigned long pgshift = __ffs(iommu->pgsize_bitmap);

                        /*
                         * Bitmap populated with the smallest supported page
                         * size
                         */
                        bitmap_set(dma->bitmap,
                                   (iova - dma->iova) >> pgshift, 1);
                }
        }
        ret = i;

        group = vfio_iommu_find_iommu_group(iommu, iommu_group);
        if (!group->pinned_page_dirty_scope) {
                group->pinned_page_dirty_scope = true;
                iommu->num_non_pinned_groups--;
        }

        goto pin_done;

pin_unwind:
        /* Pages 0 .. i-1 were pinned and tracked; drop them again. */
        pages[i] = NULL;
        for (j = 0; j < i; j++) {
                dma_addr_t iova;

                iova = user_iova + PAGE_SIZE * j;
                dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
                vfio_unpin_page_external(dma, iova, do_accounting);
                pages[j] = NULL;
        }
pin_done:
        mutex_unlock(&iommu->lock);
        return ret;
}
 898
 899 static void vfio_iommu_type1_unpin_pages(void *iommu_data,
 900                                          dma_addr_t user_iova, int npage)
 901 {
 902         struct vfio_iommu *iommu = iommu_data;
 903         bool do_accounting;
 904         int i;
 905
 906         /* Supported for v2 version only */
 907         if (WARN_ON(!iommu->v2))
 908                 return;
 909
 910         mutex_lock(&iommu->lock);
 911
 912         do_accounting = list_empty(&iommu->domain_list);
 913         for (i = 0; i < npage; i++) {
 914                 dma_addr_t iova = user_iova + PAGE_SIZE * i;
 915                 struct vfio_dma *dma;
 916
 917                 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 918                 if (!dma)
 919                         break;
 920
 921                 vfio_unpin_page_external(dma, iova, do_accounting);
 922         }
 923
 924         mutex_unlock(&iommu->lock);
 925
 926         WARN_ON(i != npage);
 927 }
 928
 929 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
 930                             struct list_head *regions,
 931                             struct iommu_iotlb_gather *iotlb_gather)
 932 {
 933         long unlocked = 0;
 934         struct vfio_regions *entry, *next;
 935
 936         iommu_iotlb_sync(domain->domain, iotlb_gather);
 937
 938         list_for_each_entry_safe(entry, next, regions, list) {
 939                 unlocked += vfio_unpin_pages_remote(dma,
 940                                                     entry->iova,
 941                                                     entry->phys >> PAGE_SHIFT,
 942                                                     entry->len >> PAGE_SHIFT,
 943                                                     false);
 944                 list_del(&entry->list);
 945                 kfree(entry);
 946         }
 947
 948         cond_resched();
 949
 950         return unlocked;
 951 }
 952
/*
 * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
 * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
 * track of these regions (currently using a list).
 *
 * This value specifies the maximum number of regions for each IOTLB flush
 * sync.
 */
 960 #define VFIO_IOMMU_TLB_SYNC_MAX         512
 961
 962 static size_t unmap_unpin_fast(struct vfio_domain *domain,
 963                                struct vfio_dma *dma, dma_addr_t *iova,
 964                                size_t len, phys_addr_t phys, long *unlocked,
 965                                struct list_head *unmapped_list,
 966                                int *unmapped_cnt,
 967                                struct iommu_iotlb_gather *iotlb_gather)
 968 {
 969         size_t unmapped = 0;
 970         struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 971
 972         if (entry) {
 973                 unmapped = iommu_unmap_fast(domain->domain, *iova, len,
 974                                             iotlb_gather);
 975
 976                 if (!unmapped) {
 977                         kfree(entry);
 978                 } else {
 979                         entry->iova = *iova;
 980                         entry->phys = phys;
 981                         entry->len  = unmapped;
 982                         list_add_tail(&entry->list, unmapped_list);
 983
 984                         *iova += unmapped;
 985                         (*unmapped_cnt)++;
 986                 }
 987         }
 988
 989         /*
 990          * Sync if the number of fast-unmap regions hits the limit
 991          * or in case of errors.
 992          */
 993         if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
 994                 *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
 995                                              iotlb_gather);
 996                 *unmapped_cnt = 0;
 997         }
 998
 999         return unmapped;
1000 }
1001
1002 static size_t unmap_unpin_slow(struct vfio_domain *domain,
1003                                struct vfio_dma *dma, dma_addr_t *iova,
1004                                size_t len, phys_addr_t phys,
1005                                long *unlocked)
1006 {
1007         size_t unmapped = iommu_unmap(domain->domain, *iova, len);
1008
1009         if (unmapped) {
1010                 *unlocked += vfio_unpin_pages_remote(dma, *iova,
1011                                                      phys >> PAGE_SHIFT,
1012                                                      unmapped >> PAGE_SHIFT,
1013                                                      false);
1014                 *iova += unmapped;
1015                 cond_resched();
1016         }
1017         return unmapped;
1018 }
1019
1020 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
1021                              bool do_accounting)
1022 {
1023         dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
1024         struct vfio_domain *domain, *d;
1025         LIST_HEAD(unmapped_region_list);
1026         struct iommu_iotlb_gather iotlb_gather;
1027         int unmapped_region_cnt = 0;
1028         long unlocked = 0;
1029
1030         if (!dma->size)
1031                 return 0;
1032
1033         if (list_empty(&iommu->domain_list))
1034                 return 0;
1035
1036         /*
1037          * We use the IOMMU to track the physical addresses, otherwise we'd
1038          * need a much more complicated tracking system.  Unfortunately that
1039          * means we need to use one of the iommu domains to figure out the
1040          * pfns to unpin.  The rest need to be unmapped in advance so we have
1041          * no iommu translations remaining when the pages are unpinned.
1042          */
1043         domain = d = list_first_entry(&iommu->domain_list,
1044                                       struct vfio_domain, next);
1045
1046         list_for_each_entry_continue(d, &iommu->domain_list, next) {
1047                 iommu_unmap(d->domain, dma->iova, dma->size);
1048                 cond_resched();
1049         }
1050
1051         iommu_iotlb_gather_init(&iotlb_gather);
1052         while (iova < end) {
1053                 size_t unmapped, len;
1054                 phys_addr_t phys, next;
1055
1056                 phys = iommu_iova_to_phys(domain->domain, iova);
1057                 if (WARN_ON(!phys)) {
1058                         iova += PAGE_SIZE;
1059                         continue;
1060                 }
1061
1062                 /*
1063                  * To optimize for fewer iommu_unmap() calls, each of which
1064                  * may require hardware cache flushing, try to find the
1065                  * largest contiguous physical memory chunk to unmap.
1066                  */
1067                 for (len = PAGE_SIZE;
1068                      !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
1069                         next = iommu_iova_to_phys(domain->domain, iova + len);
1070                         if (next != phys + len)
1071                                 break;
1072                 }
1073
1074                 /*
1075                  * First, try to use fast unmap/unpin. In case of failure,
1076                  * switch to slow unmap/unpin path.
1077                  */
1078                 unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
1079                                             &unlocked, &unmapped_region_list,
1080                                             &unmapped_region_cnt,
1081                                             &iotlb_gather);
1082                 if (!unmapped) {
1083                         unmapped = unmap_unpin_slow(domain, dma, &iova, len,
1084                                                     phys, &unlocked);
1085                         if (WARN_ON(!unmapped))
1086                                 break;
1087                 }
1088         }
1089
1090         dma->iommu_mapped = false;
1091
1092         if (unmapped_region_cnt) {
1093                 unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
1094                                             &iotlb_gather);
1095         }
1096
1097         if (do_accounting) {
1098                 vfio_lock_acct(dma, -unlocked, true);
1099                 return 0;
1100         }
1101         return unlocked;
1102 }
1103
1104 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
1105 {
1106         WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
1107         vfio_unmap_unpin(iommu, dma, true);
1108         vfio_unlink_dma(iommu, dma);
1109         put_task_struct(dma->task);
1110         mmdrop(dma->mm);
1111         vfio_dma_bitmap_free(dma);
1112         if (dma->vaddr_invalid)
1113                 iommu->vaddr_invalid_count--;
1114         kfree(dma);
1115         iommu->dma_avail++;
1116 }
1117
1118 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
1119 {
1120         struct vfio_domain *domain;
1121
1122         iommu->pgsize_bitmap = ULONG_MAX;
1123
1124         list_for_each_entry(domain, &iommu->domain_list, next)
1125                 iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
1126
1127         /*
1128          * In case the IOMMU supports page sizes smaller than PAGE_SIZE
1129          * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
1130          * That way the user will be able to map/unmap buffers whose size/
1131          * start address is aligned with PAGE_SIZE. Pinning code uses that
1132          * granularity while iommu driver can use the sub-PAGE_SIZE size
1133          * to map the buffer.
1134          */
1135         if (iommu->pgsize_bitmap & ~PAGE_MASK) {
1136                 iommu->pgsize_bitmap &= PAGE_MASK;
1137                 iommu->pgsize_bitmap |= PAGE_SIZE;
1138         }
1139 }
1140
1141 static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1142                               struct vfio_dma *dma, dma_addr_t base_iova,
1143                               size_t pgsize)
1144 {
1145         unsigned long pgshift = __ffs(pgsize);
1146         unsigned long nbits = dma->size >> pgshift;
1147         unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
1148         unsigned long copy_offset = bit_offset / BITS_PER_LONG;
1149         unsigned long shift = bit_offset % BITS_PER_LONG;
1150         unsigned long leftover;
1151
1152         /*
1153          * mark all pages dirty if any IOMMU capable device is not able
1154          * to report dirty pages and all pages are pinned and mapped.
1155          */
1156         if (iommu->num_non_pinned_groups && dma->iommu_mapped)
1157                 bitmap_set(dma->bitmap, 0, nbits);
1158
1159         if (shift) {
1160                 bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
1161                                   nbits + shift);
1162
1163                 if (copy_from_user(&leftover,
1164                                    (void __user *)(bitmap + copy_offset),
1165                                    sizeof(leftover)))
1166                         return -EFAULT;
1167
1168                 bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
1169         }
1170
1171         if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
1172                          DIRTY_BITMAP_BYTES(nbits + shift)))
1173                 return -EFAULT;
1174
1175         return 0;
1176 }
1177
1178 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1179                                   dma_addr_t iova, size_t size, size_t pgsize)
1180 {
1181         struct vfio_dma *dma;
1182         struct rb_node *n;
1183         unsigned long pgshift = __ffs(pgsize);
1184         int ret;
1185
1186         /*
1187          * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
1188          * vfio_dma mappings may be clubbed by specifying large ranges, but
1189          * there must not be any previous mappings bisected by the range.
1190          * An error will be returned if these conditions are not met.
1191          */
1192         dma = vfio_find_dma(iommu, iova, 1);
1193         if (dma && dma->iova != iova)
1194                 return -EINVAL;
1195
1196         dma = vfio_find_dma(iommu, iova + size - 1, 0);
1197         if (dma && dma->iova + dma->size != iova + size)
1198                 return -EINVAL;
1199
1200         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1201                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1202
1203                 if (dma->iova < iova)
1204                         continue;
1205
1206                 if (dma->iova > iova + size - 1)
1207                         break;
1208
1209                 ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1210                 if (ret)
1211                         return ret;
1212
1213                 /*
1214                  * Re-populate bitmap to include all pinned pages which are
1215                  * considered as dirty but exclude pages which are unpinned and
1216                  * pages which are marked dirty by vfio_dma_rw()
1217                  */
1218                 bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1219                 vfio_dma_populate_bitmap(dma, pgsize);
1220         }
1221         return 0;
1222 }
1223
1224 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1225 {
1226         if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1227             (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1228                 return -EINVAL;
1229
1230         return 0;
1231 }
1232
1233 /*
1234  * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
1235  * and unmap iovas within the range we're about to unmap. Drivers MUST unpin
1236  * pages in response to an invalidation.
1237  */
1238 static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
1239                                   struct vfio_dma *dma)
1240 {
1241         struct vfio_device *device;
1242
1243         if (list_empty(&iommu->device_list))
1244                 return;
1245
1246         /*
1247          * The device is expected to call vfio_unpin_pages() for any IOVA it has
1248          * pinned within the range. Since vfio_unpin_pages() will eventually
1249          * call back down to this code and try to obtain the iommu->lock we must
1250          * drop it.
1251          */
1252         mutex_lock(&iommu->device_list_lock);
1253         mutex_unlock(&iommu->lock);
1254
1255         list_for_each_entry(device, &iommu->device_list, iommu_entry)
1256                 device->ops->dma_unmap(device, dma->iova, dma->size);
1257
1258         mutex_unlock(&iommu->device_list_lock);
1259         mutex_lock(&iommu->lock);
1260 }
1261
1262 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1263                              struct vfio_iommu_type1_dma_unmap *unmap,
1264                              struct vfio_bitmap *bitmap)
1265 {
1266         struct vfio_dma *dma, *dma_last = NULL;
1267         size_t unmapped = 0, pgsize;
1268         int ret = -EINVAL, retries = 0;
1269         unsigned long pgshift;
1270         dma_addr_t iova = unmap->iova;
1271         u64 size = unmap->size;
1272         bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
1273         bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
1274         struct rb_node *n, *first_n;
1275
1276         mutex_lock(&iommu->lock);
1277
1278         /* Cannot update vaddr if mdev is present. */
1279         if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) {
1280                 ret = -EBUSY;
1281                 goto unlock;
1282         }
1283
1284         pgshift = __ffs(iommu->pgsize_bitmap);
1285         pgsize = (size_t)1 << pgshift;
1286
1287         if (iova & (pgsize - 1))
1288                 goto unlock;
1289
1290         if (unmap_all) {
1291                 if (iova || size)
1292                         goto unlock;
1293                 size = U64_MAX;
1294         } else if (!size || size & (pgsize - 1) ||
1295                    iova + size - 1 < iova || size > SIZE_MAX) {
1296                 goto unlock;
1297         }
1298
1299         /* When dirty tracking is enabled, allow only min supported pgsize */
1300         if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1301             (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
1302                 goto unlock;
1303         }
1304
1305         WARN_ON((pgsize - 1) & PAGE_MASK);
1306 again:
1307         /*
1308          * vfio-iommu-type1 (v1) - User mappings were coalesced together to
1309          * avoid tracking individual mappings.  This means that the granularity
1310          * of the original mapping was lost and the user was allowed to attempt
1311          * to unmap any range.  Depending on the contiguousness of physical
1312          * memory and page sizes supported by the IOMMU, arbitrary unmaps may
1313          * or may not have worked.  We only guaranteed unmap granularity
1314          * matching the original mapping; even though it was untracked here,
1315          * the original mappings are reflected in IOMMU mappings.  This
1316          * resulted in a couple unusual behaviors.  First, if a range is not
1317          * able to be unmapped, ex. a set of 4k pages that was mapped as a
1318          * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
1319          * a zero sized unmap.  Also, if an unmap request overlaps the first
1320          * address of a hugepage, the IOMMU will unmap the entire hugepage.
1321          * This also returns success and the returned unmap size reflects the
1322          * actual size unmapped.
1323          *
1324          * We attempt to maintain compatibility with this "v1" interface, but
1325          * we take control out of the hands of the IOMMU.  Therefore, an unmap
1326          * request offset from the beginning of the original mapping will
1327          * return success with zero sized unmap.  And an unmap request covering
1328          * the first iova of mapping will unmap the entire range.
1329          *
1330          * The v2 version of this interface intends to be more deterministic.
1331          * Unmap requests must fully cover previous mappings.  Multiple
1332          * mappings may still be unmaped by specifying large ranges, but there
1333          * must not be any previous mappings bisected by the range.  An error
1334          * will be returned if these conditions are not met.  The v2 interface
1335          * will only return success and a size of zero if there were no
1336          * mappings within the range.
1337          */
1338         if (iommu->v2 && !unmap_all) {
1339                 dma = vfio_find_dma(iommu, iova, 1);
1340                 if (dma && dma->iova != iova)
1341                         goto unlock;
1342
1343                 dma = vfio_find_dma(iommu, iova + size - 1, 0);
1344                 if (dma && dma->iova + dma->size != iova + size)
1345                         goto unlock;
1346         }
1347
1348         ret = 0;
1349         n = first_n = vfio_find_dma_first_node(iommu, iova, size);
1350
1351         while (n) {
1352                 dma = rb_entry(n, struct vfio_dma, node);
1353                 if (dma->iova >= iova + size)
1354                         break;
1355
1356                 if (!iommu->v2 && iova > dma->iova)
1357                         break;
1358
1359                 if (invalidate_vaddr) {
1360                         if (dma->vaddr_invalid) {
1361                                 struct rb_node *last_n = n;
1362
1363                                 for (n = first_n; n != last_n; n = rb_next(n)) {
1364                                         dma = rb_entry(n,
1365                                                        struct vfio_dma, node);
1366                                         dma->vaddr_invalid = false;
1367                                         iommu->vaddr_invalid_count--;
1368                                 }
1369                                 ret = -EINVAL;
1370                                 unmapped = 0;
1371                                 break;
1372                         }
1373                         dma->vaddr_invalid = true;
1374                         iommu->vaddr_invalid_count++;
1375                         unmapped += dma->size;
1376                         n = rb_next(n);
1377                         continue;
1378                 }
1379
1380                 if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1381                         if (dma_last == dma) {
1382                                 BUG_ON(++retries > 10);
1383                         } else {
1384                                 dma_last = dma;
1385                                 retries = 0;
1386                         }
1387
1388                         vfio_notify_dma_unmap(iommu, dma);
1389                         goto again;
1390                 }
1391
1392                 if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1393                         ret = update_user_bitmap(bitmap->data, iommu, dma,
1394                                                  iova, pgsize);
1395                         if (ret)
1396                                 break;
1397                 }
1398
1399                 unmapped += dma->size;
1400                 n = rb_next(n);
1401                 vfio_remove_dma(iommu, dma);
1402         }
1403
1404 unlock:
1405         mutex_unlock(&iommu->lock);
1406
1407         /* Report how much was unmapped */
1408         unmap->size = unmapped;
1409
1410         return ret;
1411 }
1412
1413 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1414                           unsigned long pfn, long npage, int prot)
1415 {
1416         struct vfio_domain *d;
1417         int ret;
1418
1419         list_for_each_entry(d, &iommu->domain_list, next) {
1420                 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1421                                 npage << PAGE_SHIFT, prot | IOMMU_CACHE,
1422                                 GFP_KERNEL_ACCOUNT);
1423                 if (ret)
1424                         goto unwind;
1425
1426                 cond_resched();
1427         }
1428
1429         return 0;
1430
1431 unwind:
1432         list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1433                 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1434                 cond_resched();
1435         }
1436
1437         return ret;
1438 }
1439
1440 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1441                             size_t map_size)
1442 {
1443         dma_addr_t iova = dma->iova;
1444         unsigned long vaddr = dma->vaddr;
1445         struct vfio_batch batch;
1446         size_t size = map_size;
1447         long npage;
1448         unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1449         int ret = 0;
1450
1451         vfio_batch_init(&batch);
1452
1453         while (size) {
1454                 /* Pin a contiguous chunk of memory */
1455                 npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1456                                               size >> PAGE_SHIFT, &pfn, limit,
1457                                               &batch);
1458                 if (npage <= 0) {
1459                         WARN_ON(!npage);
1460                         ret = (int)npage;
1461                         break;
1462                 }
1463
1464                 /* Map it! */
1465                 ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1466                                      dma->prot);
1467                 if (ret) {
1468                         vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1469                                                 npage, true);
1470                         vfio_batch_unpin(&batch, dma);
1471                         break;
1472                 }
1473
1474                 size -= npage << PAGE_SHIFT;
1475                 dma->size += npage << PAGE_SHIFT;
1476         }
1477
1478         vfio_batch_fini(&batch);
1479         dma->iommu_mapped = true;
1480
1481         if (ret)
1482                 vfio_remove_dma(iommu, dma);
1483
1484         return ret;
1485 }
1486
1487 /*
1488  * Check dma map request is within a valid iova range
1489  */
1490 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1491                                       dma_addr_t start, dma_addr_t end)
1492 {
1493         struct list_head *iova = &iommu->iova_list;
1494         struct vfio_iova *node;
1495
1496         list_for_each_entry(node, iova, list) {
1497                 if (start >= node->start && end <= node->end)
1498                         return true;
1499         }
1500
1501         /*
1502          * Check for list_empty() as well since a container with
1503          * a single mdev device will have an empty list.
1504          */
1505         return list_empty(iova);
1506 }
1507
1508 static int vfio_change_dma_owner(struct vfio_dma *dma)
1509 {
1510         struct task_struct *task = current->group_leader;
1511         struct mm_struct *mm = current->mm;
1512         long npage = dma->locked_vm;
1513         bool lock_cap;
1514         int ret;
1515
1516         if (mm == dma->mm)
1517                 return 0;
1518
1519         lock_cap = capable(CAP_IPC_LOCK);
1520         ret = mm_lock_acct(task, mm, lock_cap, npage);
1521         if (ret)
1522                 return ret;
1523
1524         if (mmget_not_zero(dma->mm)) {
1525                 mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
1526                 mmput(dma->mm);
1527         }
1528
1529         if (dma->task != task) {
1530                 put_task_struct(dma->task);
1531                 dma->task = get_task_struct(task);
1532         }
1533         mmdrop(dma->mm);
1534         dma->mm = mm;
1535         mmgrab(dma->mm);
1536         dma->lock_cap = lock_cap;
1537         return 0;
1538 }
1539
1540 static int vfio_dma_do_map(struct vfio_iommu *iommu,
1541                            struct vfio_iommu_type1_dma_map *map)
1542 {
1543         bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
1544         dma_addr_t iova = map->iova;
1545         unsigned long vaddr = map->vaddr;
1546         size_t size = map->size;
1547         int ret = 0, prot = 0;
1548         size_t pgsize;
1549         struct vfio_dma *dma;
1550
1551         /* Verify that none of our __u64 fields overflow */
1552         if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1553                 return -EINVAL;
1554
1555         /* READ/WRITE from device perspective */
1556         if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1557                 prot |= IOMMU_WRITE;
1558         if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1559                 prot |= IOMMU_READ;
1560
1561         if ((prot && set_vaddr) || (!prot && !set_vaddr))
1562                 return -EINVAL;
1563
1564         mutex_lock(&iommu->lock);
1565
1566         pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
1567
1568         WARN_ON((pgsize - 1) & PAGE_MASK);
1569
1570         if (!size || (size | iova | vaddr) & (pgsize - 1)) {
1571                 ret = -EINVAL;
1572                 goto out_unlock;
1573         }
1574
1575         /* Don't allow IOVA or virtual address wrap */
1576         if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
1577                 ret = -EINVAL;
1578                 goto out_unlock;
1579         }
1580
1581         dma = vfio_find_dma(iommu, iova, size);
1582         if (set_vaddr) {
1583                 if (!dma) {
1584                         ret = -ENOENT;
1585                 } else if (!dma->vaddr_invalid || dma->iova != iova ||
1586                            dma->size != size) {
1587                         ret = -EINVAL;
1588                 } else {
1589                         ret = vfio_change_dma_owner(dma);
1590                         if (ret)
1591                                 goto out_unlock;
1592                         dma->vaddr = vaddr;
1593                         dma->vaddr_invalid = false;
1594                         iommu->vaddr_invalid_count--;
1595                 }
1596                 goto out_unlock;
1597         } else if (dma) {
1598                 ret = -EEXIST;
1599                 goto out_unlock;
1600         }
1601
1602         if (!iommu->dma_avail) {
1603                 ret = -ENOSPC;
1604                 goto out_unlock;
1605         }
1606
1607         if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1608                 ret = -EINVAL;
1609                 goto out_unlock;
1610         }
1611
1612         dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1613         if (!dma) {
1614                 ret = -ENOMEM;
1615                 goto out_unlock;
1616         }
1617
1618         iommu->dma_avail--;
1619         dma->iova = iova;
1620         dma->vaddr = vaddr;
1621         dma->prot = prot;
1622
1623         /*
1624          * We need to be able to both add to a task's locked memory and test
1625          * against the locked memory limit and we need to be able to do both
1626          * outside of this call path as pinning can be asynchronous via the
1627          * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
1628          * task_struct. Save the group_leader so that all DMA tracking uses
1629          * the same task, to make debugging easier.  VM locked pages requires
1630          * an mm_struct, so grab the mm in case the task dies.
1631          */
1632         get_task_struct(current->group_leader);
1633         dma->task = current->group_leader;
1634         dma->lock_cap = capable(CAP_IPC_LOCK);
1635         dma->mm = current->mm;
1636         mmgrab(dma->mm);
1637
1638         dma->pfn_list = RB_ROOT;
1639
1640         /* Insert zero-sized and grow as we map chunks of it */
1641         vfio_link_dma(iommu, dma);
1642
1643         /* Don't pin and map if container doesn't contain IOMMU capable domain*/
1644         if (list_empty(&iommu->domain_list))
1645                 dma->size = size;
1646         else
1647                 ret = vfio_pin_map_dma(iommu, dma, size);
1648
1649         if (!ret && iommu->dirty_page_tracking) {
1650                 ret = vfio_dma_bitmap_alloc(dma, pgsize);
1651                 if (ret)
1652                         vfio_remove_dma(iommu, dma);
1653         }
1654
1655 out_unlock:
1656         mutex_unlock(&iommu->lock);
1657         return ret;
1658 }
1659
1660 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1661                              struct vfio_domain *domain)
1662 {
1663         struct vfio_batch batch;
1664         struct vfio_domain *d = NULL;
1665         struct rb_node *n;
1666         unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1667         int ret;
1668
1669         /* Arbitrarily pick the first domain in the list for lookups */
1670         if (!list_empty(&iommu->domain_list))
1671                 d = list_first_entry(&iommu->domain_list,
1672                                      struct vfio_domain, next);
1673
1674         vfio_batch_init(&batch);
1675
1676         n = rb_first(&iommu->dma_list);
1677
1678         for (; n; n = rb_next(n)) {
1679                 struct vfio_dma *dma;
1680                 dma_addr_t iova;
1681
1682                 dma = rb_entry(n, struct vfio_dma, node);
1683                 iova = dma->iova;
1684
1685                 while (iova < dma->iova + dma->size) {
1686                         phys_addr_t phys;
1687                         size_t size;
1688
1689                         if (dma->iommu_mapped) {
1690                                 phys_addr_t p;
1691                                 dma_addr_t i;
1692
1693                                 if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1694                                         ret = -EINVAL;
1695                                         goto unwind;
1696                                 }
1697
1698                                 phys = iommu_iova_to_phys(d->domain, iova);
1699
1700                                 if (WARN_ON(!phys)) {
1701                                         iova += PAGE_SIZE;
1702                                         continue;
1703                                 }
1704
1705                                 size = PAGE_SIZE;
1706                                 p = phys + size;
1707                                 i = iova + size;
1708                                 while (i < dma->iova + dma->size &&
1709                                        p == iommu_iova_to_phys(d->domain, i)) {
1710                                         size += PAGE_SIZE;
1711                                         p += PAGE_SIZE;
1712                                         i += PAGE_SIZE;
1713                                 }
1714                         } else {
1715                                 unsigned long pfn;
1716                                 unsigned long vaddr = dma->vaddr +
1717                                                      (iova - dma->iova);
1718                                 size_t n = dma->iova + dma->size - iova;
1719                                 long npage;
1720
1721                                 npage = vfio_pin_pages_remote(dma, vaddr,
1722                                                               n >> PAGE_SHIFT,
1723                                                               &pfn, limit,
1724                                                               &batch);
1725                                 if (npage <= 0) {
1726                                         WARN_ON(!npage);
1727                                         ret = (int)npage;
1728                                         goto unwind;
1729                                 }
1730
1731                                 phys = pfn << PAGE_SHIFT;
1732                                 size = npage << PAGE_SHIFT;
1733                         }
1734
1735                         ret = iommu_map(domain->domain, iova, phys, size,
1736                                         dma->prot | IOMMU_CACHE,
1737                                         GFP_KERNEL_ACCOUNT);
1738                         if (ret) {
1739                                 if (!dma->iommu_mapped) {
1740                                         vfio_unpin_pages_remote(dma, iova,
1741                                                         phys >> PAGE_SHIFT,
1742                                                         size >> PAGE_SHIFT,
1743                                                         true);
1744                                         vfio_batch_unpin(&batch, dma);
1745                                 }
1746                                 goto unwind;
1747                         }
1748
1749                         iova += size;
1750                 }
1751         }
1752
1753         /* All dmas are now mapped, defer to second tree walk for unwind */
1754         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1755                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1756
1757                 dma->iommu_mapped = true;
1758         }
1759
1760         vfio_batch_fini(&batch);
1761         return 0;
1762
1763 unwind:
1764         for (; n; n = rb_prev(n)) {
1765                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1766                 dma_addr_t iova;
1767
1768                 if (dma->iommu_mapped) {
1769                         iommu_unmap(domain->domain, dma->iova, dma->size);
1770                         continue;
1771                 }
1772
1773                 iova = dma->iova;
1774                 while (iova < dma->iova + dma->size) {
1775                         phys_addr_t phys, p;
1776                         size_t size;
1777                         dma_addr_t i;
1778
1779                         phys = iommu_iova_to_phys(domain->domain, iova);
1780                         if (!phys) {
1781                                 iova += PAGE_SIZE;
1782                                 continue;
1783                         }
1784
1785                         size = PAGE_SIZE;
1786                         p = phys + size;
1787                         i = iova + size;
1788                         while (i < dma->iova + dma->size &&
1789                                p == iommu_iova_to_phys(domain->domain, i)) {
1790                                 size += PAGE_SIZE;
1791                                 p += PAGE_SIZE;
1792                                 i += PAGE_SIZE;
1793                         }
1794
1795                         iommu_unmap(domain->domain, iova, size);
1796                         vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1797                                                 size >> PAGE_SHIFT, true);
1798                 }
1799         }
1800
1801         vfio_batch_fini(&batch);
1802         return ret;
1803 }
1804
1805 /*
1806  * We change our unmap behavior slightly depending on whether the IOMMU
1807  * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
1808  * for practically any contiguous power-of-two mapping we give it.  This means
1809  * we don't need to look for contiguous chunks ourselves to make unmapping
1810  * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
1811  * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1812  * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1813  * hugetlbfs is in use.
1814  */
1815 static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions)
1816 {
1817         int ret, order = get_order(PAGE_SIZE * 2);
1818         struct vfio_iova *region;
1819         struct page *pages;
1820         dma_addr_t start;
1821
1822         pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1823         if (!pages)
1824                 return;
1825
1826         list_for_each_entry(region, regions, list) {
1827                 start = ALIGN(region->start, PAGE_SIZE * 2);
1828                 if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
1829                         continue;
1830
1831                 ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2,
1832                                 IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE,
1833                                 GFP_KERNEL_ACCOUNT);
1834                 if (!ret) {
1835                         size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE);
1836
1837                         if (unmapped == PAGE_SIZE)
1838                                 iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE);
1839                         else
1840                                 domain->fgsp = true;
1841                 }
1842                 break;
1843         }
1844
1845         __free_pages(pages, order);
1846 }
1847
1848 static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
1849                                                  struct iommu_group *iommu_group)
1850 {
1851         struct vfio_iommu_group *g;
1852
1853         list_for_each_entry(g, &domain->group_list, next) {
1854                 if (g->iommu_group == iommu_group)
1855                         return g;
1856         }
1857
1858         return NULL;
1859 }
1860
1861 static struct vfio_iommu_group*
1862 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1863                             struct iommu_group *iommu_group)
1864 {
1865         struct vfio_iommu_group *group;
1866         struct vfio_domain *domain;
1867
1868         list_for_each_entry(domain, &iommu->domain_list, next) {
1869                 group = find_iommu_group(domain, iommu_group);
1870                 if (group)
1871                         return group;
1872         }
1873
1874         list_for_each_entry(group, &iommu->emulated_iommu_groups, next)
1875                 if (group->iommu_group == iommu_group)
1876                         return group;
1877         return NULL;
1878 }
1879
1880 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1881                                   phys_addr_t *base)
1882 {
1883         struct iommu_resv_region *region;
1884         bool ret = false;
1885
1886         list_for_each_entry(region, group_resv_regions, list) {
1887                 /*
1888                  * The presence of any 'real' MSI regions should take
1889                  * precedence over the software-managed one if the
1890                  * IOMMU driver happens to advertise both types.
1891                  */
1892                 if (region->type == IOMMU_RESV_MSI) {
1893                         ret = false;
1894                         break;
1895                 }
1896
1897                 if (region->type == IOMMU_RESV_SW_MSI) {
1898                         *base = region->start;
1899                         ret = true;
1900                 }
1901         }
1902
1903         return ret;
1904 }
1905
1906 /*
1907  * This is a helper function to insert an address range to iova list.
1908  * The list is initially created with a single entry corresponding to
1909  * the IOMMU domain geometry to which the device group is attached.
1910  * The list aperture gets modified when a new domain is added to the
1911  * container if the new aperture doesn't conflict with the current one
1912  * or with any existing dma mappings. The list is also modified to
1913  * exclude any reserved regions associated with the device group.
1914  */
1915 static int vfio_iommu_iova_insert(struct list_head *head,
1916                                   dma_addr_t start, dma_addr_t end)
1917 {
1918         struct vfio_iova *region;
1919
1920         region = kmalloc(sizeof(*region), GFP_KERNEL);
1921         if (!region)
1922                 return -ENOMEM;
1923
1924         INIT_LIST_HEAD(&region->list);
1925         region->start = start;
1926         region->end = end;
1927
1928         list_add_tail(&region->list, head);
1929         return 0;
1930 }
1931
1932 /*
1933  * Check the new iommu aperture conflicts with existing aper or with any
1934  * existing dma mappings.
1935  */
1936 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1937                                      dma_addr_t start, dma_addr_t end)
1938 {
1939         struct vfio_iova *first, *last;
1940         struct list_head *iova = &iommu->iova_list;
1941
1942         if (list_empty(iova))
1943                 return false;
1944
1945         /* Disjoint sets, return conflict */
1946         first = list_first_entry(iova, struct vfio_iova, list);
1947         last = list_last_entry(iova, struct vfio_iova, list);
1948         if (start > last->end || end < first->start)
1949                 return true;
1950
1951         /* Check for any existing dma mappings below the new start */
1952         if (start > first->start) {
1953                 if (vfio_find_dma(iommu, first->start, start - first->start))
1954                         return true;
1955         }
1956
1957         /* Check for any existing dma mappings beyond the new end */
1958         if (end < last->end) {
1959                 if (vfio_find_dma(iommu, end + 1, last->end - end))
1960                         return true;
1961         }
1962
1963         return false;
1964 }
1965
1966 /*
1967  * Resize iommu iova aperture window. This is called only if the new
1968  * aperture has no conflict with existing aperture and dma mappings.
1969  */
1970 static int vfio_iommu_aper_resize(struct list_head *iova,
1971                                   dma_addr_t start, dma_addr_t end)
1972 {
1973         struct vfio_iova *node, *next;
1974
1975         if (list_empty(iova))
1976                 return vfio_iommu_iova_insert(iova, start, end);
1977
1978         /* Adjust iova list start */
1979         list_for_each_entry_safe(node, next, iova, list) {
1980                 if (start < node->start)
1981                         break;
1982                 if (start >= node->start && start < node->end) {
1983                         node->start = start;
1984                         break;
1985                 }
1986                 /* Delete nodes before new start */
1987                 list_del(&node->list);
1988                 kfree(node);
1989         }
1990
1991         /* Adjust iova list end */
1992         list_for_each_entry_safe(node, next, iova, list) {
1993                 if (end > node->end)
1994                         continue;
1995                 if (end > node->start && end <= node->end) {
1996                         node->end = end;
1997                         continue;
1998                 }
1999                 /* Delete nodes after new end */
2000                 list_del(&node->list);
2001                 kfree(node);
2002         }
2003
2004         return 0;
2005 }
2006
2007 /*
2008  * Check reserved region conflicts with existing dma mappings
2009  */
2010 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
2011                                      struct list_head *resv_regions)
2012 {
2013         struct iommu_resv_region *region;
2014
2015         /* Check for conflict with existing dma mappings */
2016         list_for_each_entry(region, resv_regions, list) {
2017                 if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
2018                         continue;
2019
2020                 if (vfio_find_dma(iommu, region->start, region->length))
2021                         return true;
2022         }
2023
2024         return false;
2025 }
2026
2027 /*
2028  * Check iova region overlap with  reserved regions and
2029  * exclude them from the iommu iova range
2030  */
2031 static int vfio_iommu_resv_exclude(struct list_head *iova,
2032                                    struct list_head *resv_regions)
2033 {
2034         struct iommu_resv_region *resv;
2035         struct vfio_iova *n, *next;
2036
2037         list_for_each_entry(resv, resv_regions, list) {
2038                 phys_addr_t start, end;
2039
2040                 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
2041                         continue;
2042
2043                 start = resv->start;
2044                 end = resv->start + resv->length - 1;
2045
2046                 list_for_each_entry_safe(n, next, iova, list) {
2047                         int ret = 0;
2048
2049                         /* No overlap */
2050                         if (start > n->end || end < n->start)
2051                                 continue;
2052                         /*
2053                          * Insert a new node if current node overlaps with the
2054                          * reserve region to exclude that from valid iova range.
2055                          * Note that, new node is inserted before the current
2056                          * node and finally the current node is deleted keeping
2057                          * the list updated and sorted.
2058                          */
2059                         if (start > n->start)
2060                                 ret = vfio_iommu_iova_insert(&n->list, n->start,
2061                                                              start - 1);
2062                         if (!ret && end < n->end)
2063                                 ret = vfio_iommu_iova_insert(&n->list, end + 1,
2064                                                              n->end);
2065                         if (ret)
2066                                 return ret;
2067
2068                         list_del(&n->list);
2069                         kfree(n);
2070                 }
2071         }
2072
2073         if (list_empty(iova))
2074                 return -EINVAL;
2075
2076         return 0;
2077 }
2078
2079 static void vfio_iommu_resv_free(struct list_head *resv_regions)
2080 {
2081         struct iommu_resv_region *n, *next;
2082
2083         list_for_each_entry_safe(n, next, resv_regions, list) {
2084                 list_del(&n->list);
2085                 kfree(n);
2086         }
2087 }
2088
2089 static void vfio_iommu_iova_free(struct list_head *iova)
2090 {
2091         struct vfio_iova *n, *next;
2092
2093         list_for_each_entry_safe(n, next, iova, list) {
2094                 list_del(&n->list);
2095                 kfree(n);
2096         }
2097 }
2098
2099 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
2100                                     struct list_head *iova_copy)
2101 {
2102         struct list_head *iova = &iommu->iova_list;
2103         struct vfio_iova *n;
2104         int ret;
2105
2106         list_for_each_entry(n, iova, list) {
2107                 ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
2108                 if (ret)
2109                         goto out_free;
2110         }
2111
2112         return 0;
2113
2114 out_free:
2115         vfio_iommu_iova_free(iova_copy);
2116         return ret;
2117 }
2118
2119 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2120                                         struct list_head *iova_copy)
2121 {
2122         struct list_head *iova = &iommu->iova_list;
2123
2124         vfio_iommu_iova_free(iova);
2125
2126         list_splice_tail(iova_copy, iova);
2127 }
2128
/*
 * Device-iterator callback: store the result of iommu_paging_domain_alloc()
 * for @dev through @data, which must point to a struct iommu_domain pointer.
 *
 * Return: always 1, so the iteration ends after the first device.
 */
static int vfio_iommu_domain_alloc(struct device *dev, void *data)
{
	struct iommu_domain **slot = data;

	*slot = iommu_paging_domain_alloc(dev);

	/* Don't iterate */
	return 1;
}
2136
2137 static int vfio_iommu_type1_attach_group(void *iommu_data,
2138                 struct iommu_group *iommu_group, enum vfio_group_type type)
2139 {
2140         struct vfio_iommu *iommu = iommu_data;
2141         struct vfio_iommu_group *group;
2142         struct vfio_domain *domain, *d;
2143         bool resv_msi;
2144         phys_addr_t resv_msi_base = 0;
2145         struct iommu_domain_geometry *geo;
2146         LIST_HEAD(iova_copy);
2147         LIST_HEAD(group_resv_regions);
2148         int ret = -EBUSY;
2149
2150         mutex_lock(&iommu->lock);
2151
2152         /* Attach could require pinning, so disallow while vaddr is invalid. */
2153         if (iommu->vaddr_invalid_count)
2154                 goto out_unlock;
2155
2156         /* Check for duplicates */
2157         ret = -EINVAL;
2158         if (vfio_iommu_find_iommu_group(iommu, iommu_group))
2159                 goto out_unlock;
2160
2161         ret = -ENOMEM;
2162         group = kzalloc(sizeof(*group), GFP_KERNEL);
2163         if (!group)
2164                 goto out_unlock;
2165         group->iommu_group = iommu_group;
2166
2167         if (type == VFIO_EMULATED_IOMMU) {
2168                 list_add(&group->next, &iommu->emulated_iommu_groups);
2169                 /*
2170                  * An emulated IOMMU group cannot dirty memory directly, it can
2171                  * only use interfaces that provide dirty tracking.
2172                  * The iommu scope can only be promoted with the addition of a
2173                  * dirty tracking group.
2174                  */
2175                 group->pinned_page_dirty_scope = true;
2176                 ret = 0;
2177                 goto out_unlock;
2178         }
2179
2180         ret = -ENOMEM;
2181         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2182         if (!domain)
2183                 goto out_free_group;
2184
2185         /*
2186          * Going via the iommu_group iterator avoids races, and trivially gives
2187          * us a representative device for the IOMMU API call. We don't actually
2188          * want to iterate beyond the first device (if any).
2189          */
2190         iommu_group_for_each_dev(iommu_group, &domain->domain,
2191                                  vfio_iommu_domain_alloc);
2192         if (IS_ERR(domain->domain)) {
2193                 ret = PTR_ERR(domain->domain);
2194                 goto out_free_domain;
2195         }
2196
2197         ret = iommu_attach_group(domain->domain, group->iommu_group);
2198         if (ret)
2199                 goto out_domain;
2200
2201         /* Get aperture info */
2202         geo = &domain->domain->geometry;
2203         if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
2204                                      geo->aperture_end)) {
2205                 ret = -EINVAL;
2206                 goto out_detach;
2207         }
2208
2209         ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
2210         if (ret)
2211                 goto out_detach;
2212
2213         if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
2214                 ret = -EINVAL;
2215                 goto out_detach;
2216         }
2217
2218         /*
2219          * We don't want to work on the original iova list as the list
2220          * gets modified and in case of failure we have to retain the
2221          * original list. Get a copy here.
2222          */
2223         ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
2224         if (ret)
2225                 goto out_detach;
2226
2227         ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
2228                                      geo->aperture_end);
2229         if (ret)
2230                 goto out_detach;
2231
2232         ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
2233         if (ret)
2234                 goto out_detach;
2235
2236         resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
2237
2238         INIT_LIST_HEAD(&domain->group_list);
2239         list_add(&group->next, &domain->group_list);
2240
2241         if (!allow_unsafe_interrupts &&
2242             !iommu_group_has_isolated_msi(iommu_group)) {
2243                 pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2244                        __func__);
2245                 ret = -EPERM;
2246                 goto out_detach;
2247         }
2248
2249         /*
2250          * If the IOMMU can block non-coherent operations (ie PCIe TLPs with
2251          * no-snoop set) then VFIO always turns this feature on because on Intel
2252          * platforms it optimizes KVM to disable wbinvd emulation.
2253          */
2254         if (domain->domain->ops->enforce_cache_coherency)
2255                 domain->enforce_cache_coherency =
2256                         domain->domain->ops->enforce_cache_coherency(
2257                                 domain->domain);
2258
2259         /*
2260          * Try to match an existing compatible domain.  We don't want to
2261          * preclude an IOMMU driver supporting multiple bus_types and being
2262          * able to include different bus_types in the same IOMMU domain, so
2263          * we test whether the domains use the same iommu_ops rather than
2264          * testing if they're on the same bus_type.
2265          */
2266         list_for_each_entry(d, &iommu->domain_list, next) {
2267                 if (d->domain->ops == domain->domain->ops &&
2268                     d->enforce_cache_coherency ==
2269                             domain->enforce_cache_coherency) {
2270                         iommu_detach_group(domain->domain, group->iommu_group);
2271                         if (!iommu_attach_group(d->domain,
2272                                                 group->iommu_group)) {
2273                                 list_add(&group->next, &d->group_list);
2274                                 iommu_domain_free(domain->domain);
2275                                 kfree(domain);
2276                                 goto done;
2277                         }
2278
2279                         ret = iommu_attach_group(domain->domain,
2280                                                  group->iommu_group);
2281                         if (ret)
2282                                 goto out_domain;
2283                 }
2284         }
2285
2286         vfio_test_domain_fgsp(domain, &iova_copy);
2287
2288         /* replay mappings on new domains */
2289         ret = vfio_iommu_replay(iommu, domain);
2290         if (ret)
2291                 goto out_detach;
2292
2293         if (resv_msi) {
2294                 ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
2295                 if (ret && ret != -ENODEV)
2296                         goto out_detach;
2297         }
2298
2299         list_add(&domain->next, &iommu->domain_list);
2300         vfio_update_pgsize_bitmap(iommu);
2301 done:
2302         /* Delete the old one and insert new iova list */
2303         vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2304
2305         /*
2306          * An iommu backed group can dirty memory directly and therefore
2307          * demotes the iommu scope until it declares itself dirty tracking
2308          * capable via the page pinning interface.
2309          */
2310         iommu->num_non_pinned_groups++;
2311         mutex_unlock(&iommu->lock);
2312         vfio_iommu_resv_free(&group_resv_regions);
2313
2314         return 0;
2315
2316 out_detach:
2317         iommu_detach_group(domain->domain, group->iommu_group);
2318 out_domain:
2319         iommu_domain_free(domain->domain);
2320         vfio_iommu_iova_free(&iova_copy);
2321         vfio_iommu_resv_free(&group_resv_regions);
2322 out_free_domain:
2323         kfree(domain);
2324 out_free_group:
2325         kfree(group);
2326 out_unlock:
2327         mutex_unlock(&iommu->lock);
2328         return ret;
2329 }
2330
2331 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2332 {
2333         struct rb_node *node;
2334
2335         while ((node = rb_first(&iommu->dma_list)))
2336                 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2337 }
2338
2339 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2340 {
2341         struct rb_node *n, *p;
2342
2343         n = rb_first(&iommu->dma_list);
2344         for (; n; n = rb_next(n)) {
2345                 struct vfio_dma *dma;
2346                 long locked = 0, unlocked = 0;
2347
2348                 dma = rb_entry(n, struct vfio_dma, node);
2349                 unlocked += vfio_unmap_unpin(iommu, dma, false);
2350                 p = rb_first(&dma->pfn_list);
2351                 for (; p; p = rb_next(p)) {
2352                         struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2353                                                          node);
2354
2355                         if (!is_invalid_reserved_pfn(vpfn->pfn))
2356                                 locked++;
2357                 }
2358                 vfio_lock_acct(dma, locked - unlocked, true);
2359         }
2360 }
2361
2362 /*
2363  * Called when a domain is removed in detach. It is possible that
2364  * the removed domain decided the iova aperture window. Modify the
2365  * iova aperture with the smallest window among existing domains.
2366  */
2367 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2368                                    struct list_head *iova_copy)
2369 {
2370         struct vfio_domain *domain;
2371         struct vfio_iova *node;
2372         dma_addr_t start = 0;
2373         dma_addr_t end = (dma_addr_t)~0;
2374
2375         if (list_empty(iova_copy))
2376                 return;
2377
2378         list_for_each_entry(domain, &iommu->domain_list, next) {
2379                 struct iommu_domain_geometry *geo = &domain->domain->geometry;
2380
2381                 if (geo->aperture_start > start)
2382                         start = geo->aperture_start;
2383                 if (geo->aperture_end < end)
2384                         end = geo->aperture_end;
2385         }
2386
2387         /* Modify aperture limits. The new aper is either same or bigger */
2388         node = list_first_entry(iova_copy, struct vfio_iova, list);
2389         node->start = start;
2390         node = list_last_entry(iova_copy, struct vfio_iova, list);
2391         node->end = end;
2392 }
2393
2394 /*
2395  * Called when a group is detached. The reserved regions for that
2396  * group can be part of valid iova now. But since reserved regions
2397  * may be duplicated among groups, populate the iova valid regions
2398  * list again.
2399  */
2400 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2401                                    struct list_head *iova_copy)
2402 {
2403         struct vfio_domain *d;
2404         struct vfio_iommu_group *g;
2405         struct vfio_iova *node;
2406         dma_addr_t start, end;
2407         LIST_HEAD(resv_regions);
2408         int ret;
2409
2410         if (list_empty(iova_copy))
2411                 return -EINVAL;
2412
2413         list_for_each_entry(d, &iommu->domain_list, next) {
2414                 list_for_each_entry(g, &d->group_list, next) {
2415                         ret = iommu_get_group_resv_regions(g->iommu_group,
2416                                                            &resv_regions);
2417                         if (ret)
2418                                 goto done;
2419                 }
2420         }
2421
2422         node = list_first_entry(iova_copy, struct vfio_iova, list);
2423         start = node->start;
2424         node = list_last_entry(iova_copy, struct vfio_iova, list);
2425         end = node->end;
2426
2427         /* purge the iova list and create new one */
2428         vfio_iommu_iova_free(iova_copy);
2429
2430         ret = vfio_iommu_aper_resize(iova_copy, start, end);
2431         if (ret)
2432                 goto done;
2433
2434         /* Exclude current reserved regions from iova ranges */
2435         ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2436 done:
2437         vfio_iommu_resv_free(&resv_regions);
2438         return ret;
2439 }
2440
2441 static void vfio_iommu_type1_detach_group(void *iommu_data,
2442                                           struct iommu_group *iommu_group)
2443 {
2444         struct vfio_iommu *iommu = iommu_data;
2445         struct vfio_domain *domain;
2446         struct vfio_iommu_group *group;
2447         bool update_dirty_scope = false;
2448         LIST_HEAD(iova_copy);
2449
2450         mutex_lock(&iommu->lock);
2451         list_for_each_entry(group, &iommu->emulated_iommu_groups, next) {
2452                 if (group->iommu_group != iommu_group)
2453                         continue;
2454                 update_dirty_scope = !group->pinned_page_dirty_scope;
2455                 list_del(&group->next);
2456                 kfree(group);
2457
2458                 if (list_empty(&iommu->emulated_iommu_groups) &&
2459                     list_empty(&iommu->domain_list)) {
2460                         WARN_ON(!list_empty(&iommu->device_list));
2461                         vfio_iommu_unmap_unpin_all(iommu);
2462                 }
2463                 goto detach_group_done;
2464         }
2465
2466         /*
2467          * Get a copy of iova list. This will be used to update
2468          * and to replace the current one later. Please note that
2469          * we will leave the original list as it is if update fails.
2470          */
2471         vfio_iommu_iova_get_copy(iommu, &iova_copy);
2472
2473         list_for_each_entry(domain, &iommu->domain_list, next) {
2474                 group = find_iommu_group(domain, iommu_group);
2475                 if (!group)
2476                         continue;
2477
2478                 iommu_detach_group(domain->domain, group->iommu_group);
2479                 update_dirty_scope = !group->pinned_page_dirty_scope;
2480                 list_del(&group->next);
2481                 kfree(group);
2482                 /*
2483                  * Group ownership provides privilege, if the group list is
2484                  * empty, the domain goes away. If it's the last domain with
2485                  * iommu and external domain doesn't exist, then all the
2486                  * mappings go away too. If it's the last domain with iommu and
2487                  * external domain exist, update accounting
2488                  */
2489                 if (list_empty(&domain->group_list)) {
2490                         if (list_is_singular(&iommu->domain_list)) {
2491                                 if (list_empty(&iommu->emulated_iommu_groups)) {
2492                                         WARN_ON(!list_empty(
2493                                                 &iommu->device_list));
2494                                         vfio_iommu_unmap_unpin_all(iommu);
2495                                 } else {
2496                                         vfio_iommu_unmap_unpin_reaccount(iommu);
2497                                 }
2498                         }
2499                         iommu_domain_free(domain->domain);
2500                         list_del(&domain->next);
2501                         kfree(domain);
2502                         vfio_iommu_aper_expand(iommu, &iova_copy);
2503                         vfio_update_pgsize_bitmap(iommu);
2504                 }
2505                 break;
2506         }
2507
2508         if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2509                 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2510         else
2511                 vfio_iommu_iova_free(&iova_copy);
2512
2513 detach_group_done:
2514         /*
2515          * Removal of a group without dirty tracking may allow the iommu scope
2516          * to be promoted.
2517          */
2518         if (update_dirty_scope) {
2519                 iommu->num_non_pinned_groups--;
2520                 if (iommu->dirty_page_tracking)
2521                         vfio_iommu_populate_bitmap_full(iommu);
2522         }
2523         mutex_unlock(&iommu->lock);
2524 }
2525
2526 static void *vfio_iommu_type1_open(unsigned long arg)
2527 {
2528         struct vfio_iommu *iommu;
2529
2530         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2531         if (!iommu)
2532                 return ERR_PTR(-ENOMEM);
2533
2534         switch (arg) {
2535         case VFIO_TYPE1_IOMMU:
2536                 break;
2537         case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
2538         case VFIO_TYPE1v2_IOMMU:
2539                 iommu->v2 = true;
2540                 break;
2541         default:
2542                 kfree(iommu);
2543                 return ERR_PTR(-EINVAL);
2544         }
2545
2546         INIT_LIST_HEAD(&iommu->domain_list);
2547         INIT_LIST_HEAD(&iommu->iova_list);
2548         iommu->dma_list = RB_ROOT;
2549         iommu->dma_avail = dma_entry_limit;
2550         mutex_init(&iommu->lock);
2551         mutex_init(&iommu->device_list_lock);
2552         INIT_LIST_HEAD(&iommu->device_list);
2553         iommu->pgsize_bitmap = PAGE_MASK;
2554         INIT_LIST_HEAD(&iommu->emulated_iommu_groups);
2555
2556         return iommu;
2557 }
2558
2559 static void vfio_release_domain(struct vfio_domain *domain)
2560 {
2561         struct vfio_iommu_group *group, *group_tmp;
2562
2563         list_for_each_entry_safe(group, group_tmp,
2564                                  &domain->group_list, next) {
2565                 iommu_detach_group(domain->domain, group->iommu_group);
2566                 list_del(&group->next);
2567                 kfree(group);
2568         }
2569
2570         iommu_domain_free(domain->domain);
2571 }
2572
2573 static void vfio_iommu_type1_release(void *iommu_data)
2574 {
2575         struct vfio_iommu *iommu = iommu_data;
2576         struct vfio_domain *domain, *domain_tmp;
2577         struct vfio_iommu_group *group, *next_group;
2578
2579         list_for_each_entry_safe(group, next_group,
2580                         &iommu->emulated_iommu_groups, next) {
2581                 list_del(&group->next);
2582                 kfree(group);
2583         }
2584
2585         vfio_iommu_unmap_unpin_all(iommu);
2586
2587         list_for_each_entry_safe(domain, domain_tmp,
2588                                  &iommu->domain_list, next) {
2589                 vfio_release_domain(domain);
2590                 list_del(&domain->next);
2591                 kfree(domain);
2592         }
2593
2594         vfio_iommu_iova_free(&iommu->iova_list);
2595
2596         kfree(iommu);
2597 }
2598
2599 static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
2600 {
2601         struct vfio_domain *domain;
2602         int ret = 1;
2603
2604         mutex_lock(&iommu->lock);
2605         list_for_each_entry(domain, &iommu->domain_list, next) {
2606                 if (!(domain->enforce_cache_coherency)) {
2607                         ret = 0;
2608                         break;
2609                 }
2610         }
2611         mutex_unlock(&iommu->lock);
2612
2613         return ret;
2614 }
2615
2616 static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu)
2617 {
2618         bool ret;
2619
2620         mutex_lock(&iommu->lock);
2621         ret = !list_empty(&iommu->emulated_iommu_groups);
2622         mutex_unlock(&iommu->lock);
2623         return ret;
2624 }
2625
2626 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2627                                             unsigned long arg)
2628 {
2629         switch (arg) {
2630         case VFIO_TYPE1_IOMMU:
2631         case VFIO_TYPE1v2_IOMMU:
2632         case VFIO_UNMAP_ALL:
2633                 return 1;
2634         case VFIO_UPDATE_VADDR:
2635                 /*
2636                  * Disable this feature if mdevs are present.  They cannot
2637                  * safely pin/unpin/rw while vaddrs are being updated.
2638                  */
2639                 return iommu && !vfio_iommu_has_emulated(iommu);
2640         case VFIO_DMA_CC_IOMMU:
2641                 if (!iommu)
2642                         return 0;
2643                 return vfio_domains_have_enforce_cache_coherency(iommu);
2644         default:
2645                 return 0;
2646         }
2647 }
2648
2649 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2650                  struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2651                  size_t size)
2652 {
2653         struct vfio_info_cap_header *header;
2654         struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2655
2656         header = vfio_info_cap_add(caps, size,
2657                                    VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2658         if (IS_ERR(header))
2659                 return PTR_ERR(header);
2660
2661         iova_cap = container_of(header,
2662                                 struct vfio_iommu_type1_info_cap_iova_range,
2663                                 header);
2664         iova_cap->nr_iovas = cap_iovas->nr_iovas;
2665         memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2666                cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2667         return 0;
2668 }
2669
2670 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2671                                       struct vfio_info_cap *caps)
2672 {
2673         struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2674         struct vfio_iova *iova;
2675         size_t size;
2676         int iovas = 0, i = 0, ret;
2677
2678         list_for_each_entry(iova, &iommu->iova_list, list)
2679                 iovas++;
2680
2681         if (!iovas) {
2682                 /*
2683                  * Return 0 as a container with a single mdev device
2684                  * will have an empty list
2685                  */
2686                 return 0;
2687         }
2688
2689         size = struct_size(cap_iovas, iova_ranges, iovas);
2690
2691         cap_iovas = kzalloc(size, GFP_KERNEL);
2692         if (!cap_iovas)
2693                 return -ENOMEM;
2694
2695         cap_iovas->nr_iovas = iovas;
2696
2697         list_for_each_entry(iova, &iommu->iova_list, list) {
2698                 cap_iovas->iova_ranges[i].start = iova->start;
2699                 cap_iovas->iova_ranges[i].end = iova->end;
2700                 i++;
2701         }
2702
2703         ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2704
2705         kfree(cap_iovas);
2706         return ret;
2707 }
2708
2709 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2710                                            struct vfio_info_cap *caps)
2711 {
2712         struct vfio_iommu_type1_info_cap_migration cap_mig = {};
2713
2714         cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2715         cap_mig.header.version = 1;
2716
2717         cap_mig.flags = 0;
2718         /* support minimum pgsize */
2719         cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2720         cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2721
2722         return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2723 }
2724
2725 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2726                                            struct vfio_info_cap *caps)
2727 {
2728         struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2729
2730         cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2731         cap_dma_avail.header.version = 1;
2732
2733         cap_dma_avail.avail = iommu->dma_avail;
2734
2735         return vfio_info_add_capability(caps, &cap_dma_avail.header,
2736                                         sizeof(cap_dma_avail));
2737 }
2738
2739 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2740                                      unsigned long arg)
2741 {
2742         struct vfio_iommu_type1_info info = {};
2743         unsigned long minsz;
2744         struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2745         int ret;
2746
2747         minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2748
2749         if (copy_from_user(&info, (void __user *)arg, minsz))
2750                 return -EFAULT;
2751
2752         if (info.argsz < minsz)
2753                 return -EINVAL;
2754
2755         minsz = min_t(size_t, info.argsz, sizeof(info));
2756
2757         mutex_lock(&iommu->lock);
2758         info.flags = VFIO_IOMMU_INFO_PGSIZES;
2759
2760         info.iova_pgsizes = iommu->pgsize_bitmap;
2761
2762         ret = vfio_iommu_migration_build_caps(iommu, &caps);
2763
2764         if (!ret)
2765                 ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2766
2767         if (!ret)
2768                 ret = vfio_iommu_iova_build_caps(iommu, &caps);
2769
2770         mutex_unlock(&iommu->lock);
2771
2772         if (ret)
2773                 return ret;
2774
2775         if (caps.size) {
2776                 info.flags |= VFIO_IOMMU_INFO_CAPS;
2777
2778                 if (info.argsz < sizeof(info) + caps.size) {
2779                         info.argsz = sizeof(info) + caps.size;
2780                 } else {
2781                         vfio_info_cap_shift(&caps, sizeof(info));
2782                         if (copy_to_user((void __user *)arg +
2783                                         sizeof(info), caps.buf,
2784                                         caps.size)) {
2785                                 kfree(caps.buf);
2786                                 return -EFAULT;
2787                         }
2788                         info.cap_offset = sizeof(info);
2789                 }
2790
2791                 kfree(caps.buf);
2792         }
2793
2794         return copy_to_user((void __user *)arg, &info, minsz) ?
2795                         -EFAULT : 0;
2796 }
2797
2798 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2799                                     unsigned long arg)
2800 {
2801         struct vfio_iommu_type1_dma_map map;
2802         unsigned long minsz;
2803         uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
2804                         VFIO_DMA_MAP_FLAG_VADDR;
2805
2806         minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2807
2808         if (copy_from_user(&map, (void __user *)arg, minsz))
2809                 return -EFAULT;
2810
2811         if (map.argsz < minsz || map.flags & ~mask)
2812                 return -EINVAL;
2813
2814         return vfio_dma_do_map(iommu, &map);
2815 }
2816
2817 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2818                                       unsigned long arg)
2819 {
2820         struct vfio_iommu_type1_dma_unmap unmap;
2821         struct vfio_bitmap bitmap = { 0 };
2822         uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
2823                         VFIO_DMA_UNMAP_FLAG_VADDR |
2824                         VFIO_DMA_UNMAP_FLAG_ALL;
2825         unsigned long minsz;
2826         int ret;
2827
2828         minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2829
2830         if (copy_from_user(&unmap, (void __user *)arg, minsz))
2831                 return -EFAULT;
2832
2833         if (unmap.argsz < minsz || unmap.flags & ~mask)
2834                 return -EINVAL;
2835
2836         if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
2837             (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
2838                             VFIO_DMA_UNMAP_FLAG_VADDR)))
2839                 return -EINVAL;
2840
2841         if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2842                 unsigned long pgshift;
2843
2844                 if (unmap.argsz < (minsz + sizeof(bitmap)))
2845                         return -EINVAL;
2846
2847                 if (copy_from_user(&bitmap,
2848                                    (void __user *)(arg + minsz),
2849                                    sizeof(bitmap)))
2850                         return -EFAULT;
2851
2852                 if (!access_ok((void __user *)bitmap.data, bitmap.size))
2853                         return -EINVAL;
2854
2855                 pgshift = __ffs(bitmap.pgsize);
2856                 ret = verify_bitmap_size(unmap.size >> pgshift,
2857                                          bitmap.size);
2858                 if (ret)
2859                         return ret;
2860         }
2861
2862         ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
2863         if (ret)
2864                 return ret;
2865
2866         return copy_to_user((void __user *)arg, &unmap, minsz) ?
2867                         -EFAULT : 0;
2868 }
2869
2870 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
2871                                         unsigned long arg)
2872 {
2873         struct vfio_iommu_type1_dirty_bitmap dirty;
2874         uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
2875                         VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
2876                         VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
2877         unsigned long minsz;
2878         int ret = 0;
2879
2880         if (!iommu->v2)
2881                 return -EACCES;
2882
2883         minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
2884
2885         if (copy_from_user(&dirty, (void __user *)arg, minsz))
2886                 return -EFAULT;
2887
2888         if (dirty.argsz < minsz || dirty.flags & ~mask)
2889                 return -EINVAL;
2890
2891         /* only one flag should be set at a time */
2892         if (__ffs(dirty.flags) != __fls(dirty.flags))
2893                 return -EINVAL;
2894
2895         if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
2896                 size_t pgsize;
2897
2898                 mutex_lock(&iommu->lock);
2899                 pgsize = 1 << __ffs(iommu->pgsize_bitmap);
2900                 if (!iommu->dirty_page_tracking) {
2901                         ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
2902                         if (!ret)
2903                                 iommu->dirty_page_tracking = true;
2904                 }
2905                 mutex_unlock(&iommu->lock);
2906                 return ret;
2907         } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
2908                 mutex_lock(&iommu->lock);
2909                 if (iommu->dirty_page_tracking) {
2910                         iommu->dirty_page_tracking = false;
2911                         vfio_dma_bitmap_free_all(iommu);
2912                 }
2913                 mutex_unlock(&iommu->lock);
2914                 return 0;
2915         } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
2916                 struct vfio_iommu_type1_dirty_bitmap_get range;
2917                 unsigned long pgshift;
2918                 size_t data_size = dirty.argsz - minsz;
2919                 size_t iommu_pgsize;
2920
2921                 if (!data_size || data_size < sizeof(range))
2922                         return -EINVAL;
2923
2924                 if (copy_from_user(&range, (void __user *)(arg + minsz),
2925                                    sizeof(range)))
2926                         return -EFAULT;
2927
2928                 if (range.iova + range.size < range.iova)
2929                         return -EINVAL;
2930                 if (!access_ok((void __user *)range.bitmap.data,
2931                                range.bitmap.size))
2932                         return -EINVAL;
2933
2934                 pgshift = __ffs(range.bitmap.pgsize);
2935                 ret = verify_bitmap_size(range.size >> pgshift,
2936                                          range.bitmap.size);
2937                 if (ret)
2938                         return ret;
2939
2940                 mutex_lock(&iommu->lock);
2941
2942                 iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2943
2944                 /* allow only smallest supported pgsize */
2945                 if (range.bitmap.pgsize != iommu_pgsize) {
2946                         ret = -EINVAL;
2947                         goto out_unlock;
2948                 }
2949                 if (range.iova & (iommu_pgsize - 1)) {
2950                         ret = -EINVAL;
2951                         goto out_unlock;
2952                 }
2953                 if (!range.size || range.size & (iommu_pgsize - 1)) {
2954                         ret = -EINVAL;
2955                         goto out_unlock;
2956                 }
2957
2958                 if (iommu->dirty_page_tracking)
2959                         ret = vfio_iova_dirty_bitmap(range.bitmap.data,
2960                                                      iommu, range.iova,
2961                                                      range.size,
2962                                                      range.bitmap.pgsize);
2963                 else
2964                         ret = -EINVAL;
2965 out_unlock:
2966                 mutex_unlock(&iommu->lock);
2967
2968                 return ret;
2969         }
2970
2971         return -EINVAL;
2972 }
2973
2974 static long vfio_iommu_type1_ioctl(void *iommu_data,
2975                                    unsigned int cmd, unsigned long arg)
2976 {
2977         struct vfio_iommu *iommu = iommu_data;
2978
2979         switch (cmd) {
2980         case VFIO_CHECK_EXTENSION:
2981                 return vfio_iommu_type1_check_extension(iommu, arg);
2982         case VFIO_IOMMU_GET_INFO:
2983                 return vfio_iommu_type1_get_info(iommu, arg);
2984         case VFIO_IOMMU_MAP_DMA:
2985                 return vfio_iommu_type1_map_dma(iommu, arg);
2986         case VFIO_IOMMU_UNMAP_DMA:
2987                 return vfio_iommu_type1_unmap_dma(iommu, arg);
2988         case VFIO_IOMMU_DIRTY_PAGES:
2989                 return vfio_iommu_type1_dirty_pages(iommu, arg);
2990         default:
2991                 return -ENOTTY;
2992         }
2993 }
2994
2995 static void vfio_iommu_type1_register_device(void *iommu_data,
2996                                              struct vfio_device *vdev)
2997 {
2998         struct vfio_iommu *iommu = iommu_data;
2999
3000         if (!vdev->ops->dma_unmap)
3001                 return;
3002
3003         /*
3004          * list_empty(&iommu->device_list) is tested under the iommu->lock while
3005          * iteration for dma_unmap must be done under the device_list_lock.
3006          * Holding both locks here allows avoiding the device_list_lock in
3007          * several fast paths. See vfio_notify_dma_unmap()
3008          */
3009         mutex_lock(&iommu->lock);
3010         mutex_lock(&iommu->device_list_lock);
3011         list_add(&vdev->iommu_entry, &iommu->device_list);
3012         mutex_unlock(&iommu->device_list_lock);
3013         mutex_unlock(&iommu->lock);
3014 }
3015
3016 static void vfio_iommu_type1_unregister_device(void *iommu_data,
3017                                                struct vfio_device *vdev)
3018 {
3019         struct vfio_iommu *iommu = iommu_data;
3020
3021         if (!vdev->ops->dma_unmap)
3022                 return;
3023
3024         mutex_lock(&iommu->lock);
3025         mutex_lock(&iommu->device_list_lock);
3026         list_del(&vdev->iommu_entry);
3027         mutex_unlock(&iommu->device_list_lock);
3028         mutex_unlock(&iommu->lock);
3029 }
3030
3031 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
3032                                          dma_addr_t user_iova, void *data,
3033                                          size_t count, bool write,
3034                                          size_t *copied)
3035 {
3036         struct mm_struct *mm;
3037         unsigned long vaddr;
3038         struct vfio_dma *dma;
3039         bool kthread = current->mm == NULL;
3040         size_t offset;
3041
3042         *copied = 0;
3043
3044         dma = vfio_find_dma(iommu, user_iova, 1);
3045         if (!dma)
3046                 return -EINVAL;
3047
3048         if ((write && !(dma->prot & IOMMU_WRITE)) ||
3049                         !(dma->prot & IOMMU_READ))
3050                 return -EPERM;
3051
3052         mm = dma->mm;
3053         if (!mmget_not_zero(mm))
3054                 return -EPERM;
3055
3056         if (kthread)
3057                 kthread_use_mm(mm);
3058         else if (current->mm != mm)
3059                 goto out;
3060
3061         offset = user_iova - dma->iova;
3062
3063         if (count > dma->size - offset)
3064                 count = dma->size - offset;
3065
3066         vaddr = dma->vaddr + offset;
3067
3068         if (write) {
3069                 *copied = copy_to_user((void __user *)vaddr, data,
3070                                          count) ? 0 : count;
3071                 if (*copied && iommu->dirty_page_tracking) {
3072                         unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
3073                         /*
3074                          * Bitmap populated with the smallest supported page
3075                          * size
3076                          */
3077                         bitmap_set(dma->bitmap, offset >> pgshift,
3078                                    ((offset + *copied - 1) >> pgshift) -
3079                                    (offset >> pgshift) + 1);
3080                 }
3081         } else
3082                 *copied = copy_from_user(data, (void __user *)vaddr,
3083                                            count) ? 0 : count;
3084         if (kthread)
3085                 kthread_unuse_mm(mm);
3086 out:
3087         mmput(mm);
3088         return *copied ? 0 : -EFAULT;
3089 }
3090
3091 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
3092                                    void *data, size_t count, bool write)
3093 {
3094         struct vfio_iommu *iommu = iommu_data;
3095         int ret = 0;
3096         size_t done;
3097
3098         mutex_lock(&iommu->lock);
3099
3100         if (WARN_ONCE(iommu->vaddr_invalid_count,
3101                       "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) {
3102                 ret = -EBUSY;
3103                 goto out;
3104         }
3105
3106         while (count > 0) {
3107                 ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
3108                                                     count, write, &done);
3109                 if (ret)
3110                         break;
3111
3112                 count -= done;
3113                 data += done;
3114                 user_iova += done;
3115         }
3116
3117 out:
3118         mutex_unlock(&iommu->lock);
3119         return ret;
3120 }
3121
3122 static struct iommu_domain *
3123 vfio_iommu_type1_group_iommu_domain(void *iommu_data,
3124                                     struct iommu_group *iommu_group)
3125 {
3126         struct iommu_domain *domain = ERR_PTR(-ENODEV);
3127         struct vfio_iommu *iommu = iommu_data;
3128         struct vfio_domain *d;
3129
3130         if (!iommu || !iommu_group)
3131                 return ERR_PTR(-EINVAL);
3132
3133         mutex_lock(&iommu->lock);
3134         list_for_each_entry(d, &iommu->domain_list, next) {
3135                 if (find_iommu_group(d, iommu_group)) {
3136                         domain = d->domain;
3137                         break;
3138                 }
3139         }
3140         mutex_unlock(&iommu->lock);
3141
3142         return domain;
3143 }
3144
3145 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
3146         .name                   = "vfio-iommu-type1",
3147         .owner                  = THIS_MODULE,
3148         .open                   = vfio_iommu_type1_open,
3149         .release                = vfio_iommu_type1_release,
3150         .ioctl                  = vfio_iommu_type1_ioctl,
3151         .attach_group           = vfio_iommu_type1_attach_group,
3152         .detach_group           = vfio_iommu_type1_detach_group,
3153         .pin_pages              = vfio_iommu_type1_pin_pages,
3154         .unpin_pages            = vfio_iommu_type1_unpin_pages,
3155         .register_device        = vfio_iommu_type1_register_device,
3156         .unregister_device      = vfio_iommu_type1_unregister_device,
3157         .dma_rw                 = vfio_iommu_type1_dma_rw,
3158         .group_iommu_domain     = vfio_iommu_type1_group_iommu_domain,
3159 };
3160
3161 static int __init vfio_iommu_type1_init(void)
3162 {
3163         return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
3164 }
3165
3166 static void __exit vfio_iommu_type1_cleanup(void)
3167 {
3168         vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3169 }
3170
3171 module_init(vfio_iommu_type1_init);
3172 module_exit(vfio_iommu_type1_cleanup);
3173
3174 MODULE_VERSION(DRIVER_VERSION);
3175 MODULE_LICENSE("GPL v2");
3176 MODULE_AUTHOR(DRIVER_AUTHOR);
3177 MODULE_DESCRIPTION(DRIVER_DESC);