// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"
static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;
struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};
struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};
struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	wait_queue_head_t		container_q;
	bool				noiommu;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};
#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions, any use cases other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on their
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	if (ret) {
		iommu_group_put(group);
		return NULL;
	}

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);

void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
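
/*
 * Usage sketch (editorial addition, not part of the original file): a VFIO
 * bus driver takes this reference once in probe and drops it symmetrically
 * in remove.  The foo_* names are hypothetical.
 *
 *	static int foo_probe(struct device *dev)
 *	{
 *		struct iommu_group *group = vfio_iommu_group_get(dev);
 *
 *		if (!group)
 *			return -EINVAL;
 *
 *		...	vfio_add_group_dev(dev, ...) ...
 *	}
 *
 *	static void foo_remove(struct device *dev)
 *	{
 *		vfio_del_group_dev(dev);
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *	}
 */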
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif
/*
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
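
/*
 * Usage sketch (editorial addition): an IOMMU backend such as
 * vfio_iommu_type1 registers its ops table from module_init and
 * unregisters it on exit.  A minimal, hypothetical backend:
 *
 *	static const struct vfio_iommu_driver_ops foo_iommu_ops = {
 *		.name		= "vfio-foo",
 *		.owner		= THIS_MODULE,
 *		.open		= foo_open,
 *		.release	= foo_release,
 *		.ioctl		= foo_ioctl,
 *		.attach_group	= foo_attach_group,
 *		.detach_group	= foo_detach_group,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return vfio_register_iommu_driver(&foo_iommu_ops);
 *	}
 */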
/*
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
/*
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}
/*
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	init_waitqueue_head(&group->container_q);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return ERR_CAST(dev);
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}
/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->notifier.head);

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}
struct vfio_group_put_work {
	struct work_struct work;
	struct vfio_group *group;
};

static void vfio_group_put_bg(struct work_struct *work)
{
	struct vfio_group_put_work *do_work;

	do_work = container_of(work, struct vfio_group_put_work, work);

	vfio_group_put(do_work->group);
	kfree(do_work);
}

static void vfio_group_schedule_put(struct vfio_group *group)
{
	struct vfio_group_put_work *do_work;

	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
	if (WARN_ON(!do_work))
		return;

	INIT_WORK(&do_work->work, vfio_group_put_bg);
	do_work->group = group;
	schedule_work(&do_work->work);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}
/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}
static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}
static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}
/*
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}
static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}
static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);

	return NULL;
}
/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	return match_string(vfio_driver_whitelist,
			    ARRAY_SIZE(vfio_driver_whitelist),
			    drv->name) >= 0;
}
/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = READ_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}
/*
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	dev_WARN(dev, "Device added to live group %d!\n",
		 iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
			iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
			iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
			__func__, iommu_group_id(group->iommu_group),
			dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
			iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	/*
	 * If we're the last reference to the group, the group will be
	 * released, which includes unregistering the iommu group notifier.
	 * We hold a read-lock on that notifier list, unregistering needs
	 * a write-lock... deadlock.  Release our reference asynchronously
	 * to avoid that situation.
	 */
	vfio_group_schedule_put(group);
	return NOTIFY_OK;
}
/*
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		dev_WARN(dev, "Device already exists on group %d\n",
			 iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
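
/*
 * Usage sketch (editorial addition): the ops table passed to
 * vfio_add_group_dev() backs the file descriptor later returned by
 * VFIO_GROUP_GET_DEVICE_FD.  A hypothetical bus driver wires it up roughly
 * the way vfio-pci does:
 *
 *	static const struct vfio_device_ops foo_vfio_ops = {
 *		.name		= "vfio-foo",
 *		.open		= foo_open,
 *		.release	= foo_release,
 *		.ioctl		= foo_ioctl,
 *		.read		= foo_read,
 *		.write		= foo_write,
 *		.mmap		= foo_mmap,
 *		.request	= foo_request,	(see vfio_del_group_dev())
 *	};
 *
 *	ret = vfio_add_group_dev(dev, &foo_vfio_ops, foo_device_data);
 */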
/*
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	add_wait_queue(&vfio.release_q, &wait);

	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
		} else {
			wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
			if (signal_pending(current)) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	} while (1);

	remove_wait_queue(&vfio.release_q, &wait);
	/*
	 * In order to support multiple devices per group, devices can be
	 * plucked from the group while other devices in the group are still
	 * in use.  The container persists with this group and those remaining
	 * devices still attached.  If the user creates an isolation violation
	 * by binding this device to another driver while the group is still in
	 * use, that's their fault.  However, in the case of removing the last,
	 * or potentially the only, device in the group there can be no other
	 * in-use devices in the group.  The user has done their due diligence
	 * and we should lay no claims to those devices.  In order to do that,
	 * we need to make sure the group is detached from the container.
	 * Without this stall, we're potentially racing with a user process
	 * that may attempt to immediately bind this device to another driver.
	 */
	if (list_empty(&group->device_list))
		wait_event(group->container_q, !group->container);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
/*
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}
/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
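
/*
 * Userspace sketch (editorial addition, error handling elided): a container
 * is deprivileged until a group is added, so VFIO_SET_IOMMU only succeeds
 * once at least one group has been attached:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;
 *	... attach a group fd first (see the group ioctls below) ...
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 */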
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}
/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	return ret;
}

static const struct file_operations vfio_fops = {
	.owner = THIS_MODULE,
	.open = vfio_fops_open,
	.release = vfio_fops_release,
	.read = vfio_fops_read,
	.write = vfio_fops_write,
	.unlocked_ioctl = vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vfio_fops_compat_ioctl,
#endif
	.mmap = vfio_fops_mmap,
};
/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	wake_up(&group->container_q);
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}
/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}
static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Warn if previous user didn't cleanup and re-init to drop them */
	if (WARN_ON(group->notifier.head))
		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner = THIS_MODULE,
	.unlocked_ioctl = vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vfio_group_fops_compat_ioctl,
#endif
	.open = vfio_group_fops_open,
	.release = vfio_group_fops_release,
};
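
/*
 * Userspace sketch (editorial addition, continuing the container example
 * above; the group number and device name are hypothetical, following the
 * example in Documentation/vfio.txt).  A group is only usable once viable,
 * i.e. every device in it is bound to vfio, a whitelisted driver, or none:
 *
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */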
/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner = THIS_MODULE,
	.release = vfio_device_fops_release,
	.read = vfio_device_fops_read,
	.write = vfio_device_fops_write,
	.unlocked_ioctl = vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vfio_device_fops_compat_ioctl,
#endif
	.mmap = vfio_device_fops_mmap,
};
/*
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;
	int ret;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	ret = vfio_group_add_container_user(group);
	if (ret)
		return ERR_PTR(ret);

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

bool vfio_external_group_match_file(struct vfio_group *test_group,
				    struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	return (filep->f_op == &vfio_group_fops) && (group == test_group);
}
EXPORT_SYMBOL_GPL(vfio_external_group_match_file);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
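
/*
 * Usage sketch (editorial addition): KVM is the in-tree external user.
 * Given a group fd from userspace it follows the protocol above, roughly:
 *
 *	struct vfio_group *grp = vfio_group_get_external_user(filep);
 *
 *	if (IS_ERR(grp))
 *		return PTR_ERR(grp);
 *
 *	id = vfio_external_user_iommu_id(grp);
 *	...
 *	vfio_group_put_external_user(grp);
 */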
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
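
/*
 * Usage sketch (editorial addition): a sub-driver builds its capability
 * chain into a local struct vfio_info_cap and, once the final user buffer
 * layout is known, shifts the chain before copying it out.  The foo_cap
 * name is hypothetical:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &foo_cap.header,
 *				       sizeof(foo_cap));
 *	...
 *	if (caps.size) {
 *		info.flags |= VFIO_DEVICE_FLAGS_CAPS;
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		copy_to_user((void __user *)arg + sizeof(info),
 *			     caps.buf, caps.size);
 *		kfree(caps.buf);
 *	}
 */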
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
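
/*
 * Usage sketch (editorial addition): a VFIO_DEVICE_SET_IRQS handler
 * validates the header first, then copies exactly *data_size bytes of
 * payload, e.g. in a PCI bus driver:
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 VFIO_PCI_NUM_IRQS,
 *						 &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */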
/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
/*
 * Unpin set of host PFNs for local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.  Number of
 *		   user/guest PFNs should not be greater than
 *		   VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
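
/*
 * Usage sketch (editorial addition): a mediated device (mdev) vendor driver
 * pins in batches of at most VFIO_PIN_PAGES_MAX_ENTRIES and unpins with the
 * same user PFN array; both calls return an error or the number of pages
 * processed:
 *
 *	ret = vfio_pin_pages(dev, user_pfns, npage,
 *			     IOMMU_READ | IOMMU_WRITE, phys_pfns);
 *	if (ret != npage)
 *		goto err;
 *	...
 *	vfio_unpin_pages(dev, user_pfns, npage);
 */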
static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}
void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if still events remaining */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The attaching of kvm and vfio_group might already happen, so
	 * here we replay once upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_group_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	vfio_group_try_dissolve_container(group);

	return ret;
}
int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
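
/*
 * Usage sketch (editorial addition): an mdev vendor driver typically
 * registers for DMA unmap events so it can invalidate its own pinnings.
 * The foo_* names are hypothetical:
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *
 *	foo->nb.notifier_call = foo_iommu_notifier;
 *	ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events,
 *				     &foo->nb);
 *	...
 *	vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, &foo->nb);
 */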
/*
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};
static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
}
2248 module_exit(vfio_cleanup);
2250 MODULE_VERSION(DRIVER_VERSION);
2251 MODULE_LICENSE("GPL v2");
2252 MODULE_AUTHOR(DRIVER_AUTHOR);
2253 MODULE_DESCRIPTION(DRIVER_DESC);
2254 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2255 MODULE_ALIAS("devname:vfio/vfio");
2256 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");