hw/vfio/pci.c

   1 /*
   2  * vfio based device assignment support
   3  *
   4  * Copyright Red Hat, Inc. 2012
   5  *
   6  * Authors:
   7  *  Alex Williamson <[email protected]>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  * Based on qemu-kvm device-assignment:
  13  *  Adapted for KVM by Qumranet.
  14  *  Copyright (c) 2007, Neocleus, Alex Novik ([email protected])
  15  *  Copyright (c) 2007, Neocleus, Guy Zana ([email protected])
  16  *  Copyright (C) 2008, Qumranet, Amit Shah ([email protected])
  17  *  Copyright (C) 2008, Red Hat, Amit Shah ([email protected])
  18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda ([email protected])
  19  */
  20
  21 #include <linux/vfio.h>
  22 #include <sys/ioctl.h>
  23 #include <sys/mman.h>
  24 #include <sys/stat.h>
  25 #include <sys/types.h>
  26 #include <unistd.h>
  27
  28 #include "config.h"
  29 #include "hw/pci/msi.h"
  30 #include "hw/pci/msix.h"
  31 #include "hw/pci/pci_bridge.h"
  32 #include "qemu/error-report.h"
  33 #include "qemu/range.h"
  34 #include "sysemu/kvm.h"
  35 #include "sysemu/sysemu.h"
  36 #include "pci.h"
  37 #include "trace.h"
  38
  39 #define MSIX_CAP_LENGTH 12
  40
  41 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
  42 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
  43
  44 /*
  45  * Disabling BAR mmaping can be slow, but toggling it around INTx can
  46  * also be a huge overhead.  We try to get the best of both worlds by
  47  * waiting until an interrupt to disable mmaps (subsequent transitions
  48  * to the same state are effectively no overhead).  If the interrupt has
  49  * been serviced and the time gap is long enough, we re-enable mmaps for
  50  * performance.  This works well for things like graphics cards, which
  51  * may not use their interrupt at all and are penalized to an unusable
  52  * level by read/write BAR traps.  Other devices, like NICs, have more
  53  * regular interrupts and see much better latency by staying in non-mmap
  54  * mode.  We therefore set the default mmap_timeout such that a ping
  55  * is just enough to keep the mmap disabled.  Users can experiment with
  56  * other options with the x-intx-mmap-timeout-ms parameter (a value of
  57  * zero disables the timer).
  58  */
  59 static void vfio_intx_mmap_enable(void *opaque)
  60 {
  61     VFIOPCIDevice *vdev = opaque;
  62
  63     if (vdev->intx.pending) {
  64         timer_mod(vdev->intx.mmap_timer,
  65                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
  66         return;
  67     }
  68
  69     vfio_mmap_set_enabled(vdev, true);
  70 }
  71
  72 static void vfio_intx_interrupt(void *opaque)
  73 {
  74     VFIOPCIDevice *vdev = opaque;
  75
  76     if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
  77         return;
  78     }
  79
  80     trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
  81
  82     vdev->intx.pending = true;
  83     pci_irq_assert(&vdev->pdev);
  84     vfio_mmap_set_enabled(vdev, false);
  85     if (vdev->intx.mmap_timeout) {
  86         timer_mod(vdev->intx.mmap_timer,
  87                        qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
  88     }
  89 }
  90
  91 static void vfio_intx_eoi(VFIODevice *vbasedev)
  92 {
  93     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
  94
  95     if (!vdev->intx.pending) {
  96         return;
  97     }
  98
  99     trace_vfio_intx_eoi(vbasedev->name);
 100
 101     vdev->intx.pending = false;
 102     pci_irq_deassert(&vdev->pdev);
 103     vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 104 }
 105
 106 static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev)
 107 {
 108 #ifdef CONFIG_KVM
 109     struct kvm_irqfd irqfd = {
 110         .fd = event_notifier_get_fd(&vdev->intx.interrupt),
 111         .gsi = vdev->intx.route.irq,
 112         .flags = KVM_IRQFD_FLAG_RESAMPLE,
 113     };
 114     struct vfio_irq_set *irq_set;
 115     int ret, argsz;
 116     int32_t *pfd;
 117
 118     if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
 119         vdev->intx.route.mode != PCI_INTX_ENABLED ||
 120         !kvm_resamplefds_enabled()) {
 121         return;
 122     }
 123
 124     /* Get to a known interrupt state */
 125     qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
 126     vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 127     vdev->intx.pending = false;
 128     pci_irq_deassert(&vdev->pdev);
 129
 130     /* Get an eventfd for resample/unmask */
 131     if (event_notifier_init(&vdev->intx.unmask, 0)) {
 132         error_report("vfio: Error: event_notifier_init failed eoi");
 133         goto fail;
 134     }
 135
 136     /* KVM triggers it, VFIO listens for it */
 137     irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
 138
 139     if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
 140         error_report("vfio: Error: Failed to setup resample irqfd: %m");
 141         goto fail_irqfd;
 142     }
 143
 144     argsz = sizeof(*irq_set) + sizeof(*pfd);
 145
 146     irq_set = g_malloc0(argsz);
 147     irq_set->argsz = argsz;
 148     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
 149     irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 150     irq_set->start = 0;
 151     irq_set->count = 1;
 152     pfd = (int32_t *)&irq_set->data;
 153
 154     *pfd = irqfd.resamplefd;
 155
 156     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 157     g_free(irq_set);
 158     if (ret) {
 159         error_report("vfio: Error: Failed to setup INTx unmask fd: %m");
 160         goto fail_vfio;
 161     }
 162
 163     /* Let'em rip */
 164     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 165
 166     vdev->intx.kvm_accel = true;
 167
 168     trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
 169
 170     return;
 171
 172 fail_vfio:
 173     irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
 174     kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
 175 fail_irqfd:
 176     event_notifier_cleanup(&vdev->intx.unmask);
 177 fail:
 178     qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
 179     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 180 #endif
 181 }
 182
 183 static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
 184 {
 185 #ifdef CONFIG_KVM
 186     struct kvm_irqfd irqfd = {
 187         .fd = event_notifier_get_fd(&vdev->intx.interrupt),
 188         .gsi = vdev->intx.route.irq,
 189         .flags = KVM_IRQFD_FLAG_DEASSIGN,
 190     };
 191
 192     if (!vdev->intx.kvm_accel) {
 193         return;
 194     }
 195
 196     /*
 197      * Get to a known state, hardware masked, QEMU ready to accept new
 198      * interrupts, QEMU IRQ de-asserted.
 199      */
 200     vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 201     vdev->intx.pending = false;
 202     pci_irq_deassert(&vdev->pdev);
 203
 204     /* Tell KVM to stop listening for an INTx irqfd */
 205     if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
 206         error_report("vfio: Error: Failed to disable INTx irqfd: %m");
 207     }
 208
 209     /* We only need to close the eventfd for VFIO to cleanup the kernel side */
 210     event_notifier_cleanup(&vdev->intx.unmask);
 211
 212     /* QEMU starts listening for interrupt events. */
 213     qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
 214
 215     vdev->intx.kvm_accel = false;
 216
 217     /* If we've missed an event, let it re-fire through QEMU */
 218     vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 219
 220     trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
 221 #endif
 222 }
 223
 224 static void vfio_intx_update(PCIDevice *pdev)
 225 {
 226     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 227     PCIINTxRoute route;
 228
 229     if (vdev->interrupt != VFIO_INT_INTx) {
 230         return;
 231     }
 232
 233     route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
 234
 235     if (!pci_intx_route_changed(&vdev->intx.route, &route)) {
 236         return; /* Nothing changed */
 237     }
 238
 239     trace_vfio_intx_update(vdev->vbasedev.name,
 240                            vdev->intx.route.irq, route.irq);
 241
 242     vfio_intx_disable_kvm(vdev);
 243
 244     vdev->intx.route = route;
 245
 246     if (route.mode != PCI_INTX_ENABLED) {
 247         return;
 248     }
 249
 250     vfio_intx_enable_kvm(vdev);
 251
 252     /* Re-enable the interrupt in cased we missed an EOI */
 253     vfio_intx_eoi(&vdev->vbasedev);
 254 }
 255
 256 static int vfio_intx_enable(VFIOPCIDevice *vdev)
 257 {
 258     uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
 259     int ret, argsz;
 260     struct vfio_irq_set *irq_set;
 261     int32_t *pfd;
 262
 263     if (!pin) {
 264         return 0;
 265     }
 266
 267     vfio_disable_interrupts(vdev);
 268
 269     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
 270     pci_config_set_interrupt_pin(vdev->pdev.config, pin);
 271
 272 #ifdef CONFIG_KVM
 273     /*
 274      * Only conditional to avoid generating error messages on platforms
 275      * where we won't actually use the result anyway.
 276      */
 277     if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
 278         vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
 279                                                         vdev->intx.pin);
 280     }
 281 #endif
 282
 283     ret = event_notifier_init(&vdev->intx.interrupt, 0);
 284     if (ret) {
 285         error_report("vfio: Error: event_notifier_init failed");
 286         return ret;
 287     }
 288
 289     argsz = sizeof(*irq_set) + sizeof(*pfd);
 290
 291     irq_set = g_malloc0(argsz);
 292     irq_set->argsz = argsz;
 293     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 294     irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 295     irq_set->start = 0;
 296     irq_set->count = 1;
 297     pfd = (int32_t *)&irq_set->data;
 298
 299     *pfd = event_notifier_get_fd(&vdev->intx.interrupt);
 300     qemu_set_fd_handler(*pfd, vfio_intx_interrupt, NULL, vdev);
 301
 302     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 303     g_free(irq_set);
 304     if (ret) {
 305         error_report("vfio: Error: Failed to setup INTx fd: %m");
 306         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
 307         event_notifier_cleanup(&vdev->intx.interrupt);
 308         return -errno;
 309     }
 310
 311     vfio_intx_enable_kvm(vdev);
 312
 313     vdev->interrupt = VFIO_INT_INTx;
 314
 315     trace_vfio_intx_enable(vdev->vbasedev.name);
 316
 317     return 0;
 318 }
 319
 320 static void vfio_intx_disable(VFIOPCIDevice *vdev)
 321 {
 322     int fd;
 323
 324     timer_del(vdev->intx.mmap_timer);
 325     vfio_intx_disable_kvm(vdev);
 326     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 327     vdev->intx.pending = false;
 328     pci_irq_deassert(&vdev->pdev);
 329     vfio_mmap_set_enabled(vdev, true);
 330
 331     fd = event_notifier_get_fd(&vdev->intx.interrupt);
 332     qemu_set_fd_handler(fd, NULL, NULL, vdev);
 333     event_notifier_cleanup(&vdev->intx.interrupt);
 334
 335     vdev->interrupt = VFIO_INT_NONE;
 336
 337     trace_vfio_intx_disable(vdev->vbasedev.name);
 338 }
 339
 340 /*
 341  * MSI/X
 342  */
 343 static void vfio_msi_interrupt(void *opaque)
 344 {
 345     VFIOMSIVector *vector = opaque;
 346     VFIOPCIDevice *vdev = vector->vdev;
 347     MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
 348     void (*notify)(PCIDevice *dev, unsigned vector);
 349     MSIMessage msg;
 350     int nr = vector - vdev->msi_vectors;
 351
 352     if (!event_notifier_test_and_clear(&vector->interrupt)) {
 353         return;
 354     }
 355
 356     if (vdev->interrupt == VFIO_INT_MSIX) {
 357         get_msg = msix_get_message;
 358         notify = msix_notify;
 359
 360         /* A masked vector firing needs to use the PBA, enable it */
 361         if (msix_is_masked(&vdev->pdev, nr)) {
 362             set_bit(nr, vdev->msix->pending);
 363             memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
 364             trace_vfio_msix_pba_enable(vdev->vbasedev.name);
 365         }
 366     } else if (vdev->interrupt == VFIO_INT_MSI) {
 367         get_msg = msi_get_message;
 368         notify = msi_notify;
 369     } else {
 370         abort();
 371     }
 372
 373     msg = get_msg(&vdev->pdev, nr);
 374     trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
 375     notify(&vdev->pdev, nr);
 376 }
 377
 378 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
 379 {
 380     struct vfio_irq_set *irq_set;
 381     int ret = 0, i, argsz;
 382     int32_t *fds;
 383
 384     argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 385
 386     irq_set = g_malloc0(argsz);
 387     irq_set->argsz = argsz;
 388     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 389     irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
 390     irq_set->start = 0;
 391     irq_set->count = vdev->nr_vectors;
 392     fds = (int32_t *)&irq_set->data;
 393
 394     for (i = 0; i < vdev->nr_vectors; i++) {
 395         int fd = -1;
 396
 397         /*
 398          * MSI vs MSI-X - The guest has direct access to MSI mask and pending
 399          * bits, therefore we always use the KVM signaling path when setup.
 400          * MSI-X mask and pending bits are emulated, so we want to use the
 401          * KVM signaling path only when configured and unmasked.
 402          */
 403         if (vdev->msi_vectors[i].use) {
 404             if (vdev->msi_vectors[i].virq < 0 ||
 405                 (msix && msix_is_masked(&vdev->pdev, i))) {
 406                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
 407             } else {
 408                 fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
 409             }
 410         }
 411
 412         fds[i] = fd;
 413     }
 414
 415     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 416
 417     g_free(irq_set);
 418
 419     return ret;
 420 }
 421
 422 static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
 423                                   MSIMessage *msg, bool msix)
 424 {
 425     int virq;
 426
 427     if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi) || !msg) {
 428         return;
 429     }
 430
 431     if (event_notifier_init(&vector->kvm_interrupt, 0)) {
 432         return;
 433     }
 434
 435     virq = kvm_irqchip_add_msi_route(kvm_state, *msg, &vdev->pdev);
 436     if (virq < 0) {
 437         event_notifier_cleanup(&vector->kvm_interrupt);
 438         return;
 439     }
 440
 441     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
 442                                        NULL, virq) < 0) {
 443         kvm_irqchip_release_virq(kvm_state, virq);
 444         event_notifier_cleanup(&vector->kvm_interrupt);
 445         return;
 446     }
 447
 448     vector->virq = virq;
 449 }
 450
 451 static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
 452 {
 453     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
 454                                           vector->virq);
 455     kvm_irqchip_release_virq(kvm_state, vector->virq);
 456     vector->virq = -1;
 457     event_notifier_cleanup(&vector->kvm_interrupt);
 458 }
 459
 460 static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
 461                                      PCIDevice *pdev)
 462 {
 463     kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
 464 }
 465
 466 static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
 467                                    MSIMessage *msg, IOHandler *handler)
 468 {
 469     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 470     VFIOMSIVector *vector;
 471     int ret;
 472
 473     trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
 474
 475     vector = &vdev->msi_vectors[nr];
 476
 477     if (!vector->use) {
 478         vector->vdev = vdev;
 479         vector->virq = -1;
 480         if (event_notifier_init(&vector->interrupt, 0)) {
 481             error_report("vfio: Error: event_notifier_init failed");
 482         }
 483         vector->use = true;
 484         msix_vector_use(pdev, nr);
 485     }
 486
 487     qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 488                         handler, NULL, vector);
 489
 490     /*
 491      * Attempt to enable route through KVM irqchip,
 492      * default to userspace handling if unavailable.
 493      */
 494     if (vector->virq >= 0) {
 495         if (!msg) {
 496             vfio_remove_kvm_msi_virq(vector);
 497         } else {
 498             vfio_update_kvm_msi_virq(vector, *msg, pdev);
 499         }
 500     } else {
 501         vfio_add_kvm_msi_virq(vdev, vector, msg, true);
 502     }
 503
 504     /*
 505      * We don't want to have the host allocate all possible MSI vectors
 506      * for a device if they're not in use, so we shutdown and incrementally
 507      * increase them as needed.
 508      */
 509     if (vdev->nr_vectors < nr + 1) {
 510         vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 511         vdev->nr_vectors = nr + 1;
 512         ret = vfio_enable_vectors(vdev, true);
 513         if (ret) {
 514             error_report("vfio: failed to enable vectors, %d", ret);
 515         }
 516     } else {
 517         int argsz;
 518         struct vfio_irq_set *irq_set;
 519         int32_t *pfd;
 520
 521         argsz = sizeof(*irq_set) + sizeof(*pfd);
 522
 523         irq_set = g_malloc0(argsz);
 524         irq_set->argsz = argsz;
 525         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 526                          VFIO_IRQ_SET_ACTION_TRIGGER;
 527         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 528         irq_set->start = nr;
 529         irq_set->count = 1;
 530         pfd = (int32_t *)&irq_set->data;
 531
 532         if (vector->virq >= 0) {
 533             *pfd = event_notifier_get_fd(&vector->kvm_interrupt);
 534         } else {
 535             *pfd = event_notifier_get_fd(&vector->interrupt);
 536         }
 537
 538         ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 539         g_free(irq_set);
 540         if (ret) {
 541             error_report("vfio: failed to modify vector, %d", ret);
 542         }
 543     }
 544
 545     /* Disable PBA emulation when nothing more is pending. */
 546     clear_bit(nr, vdev->msix->pending);
 547     if (find_first_bit(vdev->msix->pending,
 548                        vdev->nr_vectors) == vdev->nr_vectors) {
 549         memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
 550         trace_vfio_msix_pba_disable(vdev->vbasedev.name);
 551     }
 552
 553     return 0;
 554 }
 555
 556 static int vfio_msix_vector_use(PCIDevice *pdev,
 557                                 unsigned int nr, MSIMessage msg)
 558 {
 559     return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
 560 }
 561
 562 static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 563 {
 564     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
 565     VFIOMSIVector *vector = &vdev->msi_vectors[nr];
 566
 567     trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
 568
 569     /*
 570      * There are still old guests that mask and unmask vectors on every
 571      * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
 572      * the KVM setup in place, simply switch VFIO to use the non-bypass
 573      * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
 574      * core will mask the interrupt and set pending bits, allowing it to
 575      * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
 576      */
 577     if (vector->virq >= 0) {
 578         int argsz;
 579         struct vfio_irq_set *irq_set;
 580         int32_t *pfd;
 581
 582         argsz = sizeof(*irq_set) + sizeof(*pfd);
 583
 584         irq_set = g_malloc0(argsz);
 585         irq_set->argsz = argsz;
 586         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 587                          VFIO_IRQ_SET_ACTION_TRIGGER;
 588         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 589         irq_set->start = nr;
 590         irq_set->count = 1;
 591         pfd = (int32_t *)&irq_set->data;
 592
 593         *pfd = event_notifier_get_fd(&vector->interrupt);
 594
 595         ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 596
 597         g_free(irq_set);
 598     }
 599 }
 600
 601 static void vfio_msix_enable(VFIOPCIDevice *vdev)
 602 {
 603     vfio_disable_interrupts(vdev);
 604
 605     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
 606
 607     vdev->interrupt = VFIO_INT_MSIX;
 608
 609     /*
 610      * Some communication channels between VF & PF or PF & fw rely on the
 611      * physical state of the device and expect that enabling MSI-X from the
 612      * guest enables the same on the host.  When our guest is Linux, the
 613      * guest driver call to pci_enable_msix() sets the enabling bit in the
 614      * MSI-X capability, but leaves the vector table masked.  We therefore
 615      * can't rely on a vector_use callback (from request_irq() in the guest)
 616      * to switch the physical device into MSI-X mode because that may come a
 617      * long time after pci_enable_msix().  This code enables vector 0 with
 618      * triggering to userspace, then immediately release the vector, leaving
 619      * the physical device with no vectors enabled, but MSI-X enabled, just
 620      * like the guest view.
 621      */
 622     vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
 623     vfio_msix_vector_release(&vdev->pdev, 0);
 624
 625     if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
 626                                   vfio_msix_vector_release, NULL)) {
 627         error_report("vfio: msix_set_vector_notifiers failed");
 628     }
 629
 630     trace_vfio_msix_enable(vdev->vbasedev.name);
 631 }
 632
 633 static void vfio_msi_enable(VFIOPCIDevice *vdev)
 634 {
 635     int ret, i;
 636
 637     vfio_disable_interrupts(vdev);
 638
 639     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
 640 retry:
 641     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
 642
 643     for (i = 0; i < vdev->nr_vectors; i++) {
 644         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 645         MSIMessage msg = msi_get_message(&vdev->pdev, i);
 646
 647         vector->vdev = vdev;
 648         vector->virq = -1;
 649         vector->use = true;
 650
 651         if (event_notifier_init(&vector->interrupt, 0)) {
 652             error_report("vfio: Error: event_notifier_init failed");
 653         }
 654
 655         qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 656                             vfio_msi_interrupt, NULL, vector);
 657
 658         /*
 659          * Attempt to enable route through KVM irqchip,
 660          * default to userspace handling if unavailable.
 661          */
 662         vfio_add_kvm_msi_virq(vdev, vector, &msg, false);
 663     }
 664
 665     /* Set interrupt type prior to possible interrupts */
 666     vdev->interrupt = VFIO_INT_MSI;
 667
 668     ret = vfio_enable_vectors(vdev, false);
 669     if (ret) {
 670         if (ret < 0) {
 671             error_report("vfio: Error: Failed to setup MSI fds: %m");
 672         } else if (ret != vdev->nr_vectors) {
 673             error_report("vfio: Error: Failed to enable %d "
 674                          "MSI vectors, retry with %d", vdev->nr_vectors, ret);
 675         }
 676
 677         for (i = 0; i < vdev->nr_vectors; i++) {
 678             VFIOMSIVector *vector = &vdev->msi_vectors[i];
 679             if (vector->virq >= 0) {
 680                 vfio_remove_kvm_msi_virq(vector);
 681             }
 682             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 683                                 NULL, NULL, NULL);
 684             event_notifier_cleanup(&vector->interrupt);
 685         }
 686
 687         g_free(vdev->msi_vectors);
 688
 689         if (ret > 0 && ret != vdev->nr_vectors) {
 690             vdev->nr_vectors = ret;
 691             goto retry;
 692         }
 693         vdev->nr_vectors = 0;
 694
 695         /*
 696          * Failing to setup MSI doesn't really fall within any specification.
 697          * Let's try leaving interrupts disabled and hope the guest figures
 698          * out to fall back to INTx for this device.
 699          */
 700         error_report("vfio: Error: Failed to enable MSI");
 701         vdev->interrupt = VFIO_INT_NONE;
 702
 703         return;
 704     }
 705
 706     trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
 707 }
 708
 709 static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
 710 {
 711     int i;
 712
 713     for (i = 0; i < vdev->nr_vectors; i++) {
 714         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 715         if (vdev->msi_vectors[i].use) {
 716             if (vector->virq >= 0) {
 717                 vfio_remove_kvm_msi_virq(vector);
 718             }
 719             qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 720                                 NULL, NULL, NULL);
 721             event_notifier_cleanup(&vector->interrupt);
 722         }
 723     }
 724
 725     g_free(vdev->msi_vectors);
 726     vdev->msi_vectors = NULL;
 727     vdev->nr_vectors = 0;
 728     vdev->interrupt = VFIO_INT_NONE;
 729
 730     vfio_intx_enable(vdev);
 731 }
 732
 733 static void vfio_msix_disable(VFIOPCIDevice *vdev)
 734 {
 735     int i;
 736
 737     msix_unset_vector_notifiers(&vdev->pdev);
 738
 739     /*
 740      * MSI-X will only release vectors if MSI-X is still enabled on the
 741      * device, check through the rest and release it ourselves if necessary.
 742      */
 743     for (i = 0; i < vdev->nr_vectors; i++) {
 744         if (vdev->msi_vectors[i].use) {
 745             vfio_msix_vector_release(&vdev->pdev, i);
 746             msix_vector_unuse(&vdev->pdev, i);
 747         }
 748     }
 749
 750     if (vdev->nr_vectors) {
 751         vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 752     }
 753
 754     vfio_msi_disable_common(vdev);
 755
 756     memset(vdev->msix->pending, 0,
 757            BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
 758
 759     trace_vfio_msix_disable(vdev->vbasedev.name);
 760 }
 761
 762 static void vfio_msi_disable(VFIOPCIDevice *vdev)
 763 {
 764     vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
 765     vfio_msi_disable_common(vdev);
 766
 767     trace_vfio_msi_disable(vdev->vbasedev.name);
 768 }
 769
 770 static void vfio_update_msi(VFIOPCIDevice *vdev)
 771 {
 772     int i;
 773
 774     for (i = 0; i < vdev->nr_vectors; i++) {
 775         VFIOMSIVector *vector = &vdev->msi_vectors[i];
 776         MSIMessage msg;
 777
 778         if (!vector->use || vector->virq < 0) {
 779             continue;
 780         }
 781
 782         msg = msi_get_message(&vdev->pdev, i);
 783         vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
 784     }
 785 }
 786
 787 static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
 788 {
 789     struct vfio_region_info reg_info = {
 790         .argsz = sizeof(reg_info),
 791         .index = VFIO_PCI_ROM_REGION_INDEX
 792     };
 793     uint64_t size;
 794     off_t off = 0;
 795     ssize_t bytes;
 796
 797     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
 798         error_report("vfio: Error getting ROM info: %m");
 799         return;
 800     }
 801
 802     trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info.size,
 803                             (unsigned long)reg_info.offset,
 804                             (unsigned long)reg_info.flags);
 805
 806     vdev->rom_size = size = reg_info.size;
 807     vdev->rom_offset = reg_info.offset;
 808
 809     if (!vdev->rom_size) {
 810         vdev->rom_read_failed = true;
 811         error_report("vfio-pci: Cannot read device rom at "
 812                     "%s", vdev->vbasedev.name);
 813         error_printf("Device option ROM contents are probably invalid "
 814                     "(check dmesg).\nSkip option ROM probe with rombar=0, "
 815                     "or load from file with romfile=\n");
 816         return;
 817     }
 818
 819     vdev->rom = g_malloc(size);
 820     memset(vdev->rom, 0xff, size);
 821
 822     while (size) {
 823         bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
 824                       size, vdev->rom_offset + off);
 825         if (bytes == 0) {
 826             break;
 827         } else if (bytes > 0) {
 828             off += bytes;
 829             size -= bytes;
 830         } else {
 831             if (errno == EINTR || errno == EAGAIN) {
 832                 continue;
 833             }
 834             error_report("vfio: Error reading device ROM: %m");
 835             break;
 836         }
 837     }
 838 }
 839
 840 static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
 841 {
 842     VFIOPCIDevice *vdev = opaque;
 843     union {
 844         uint8_t byte;
 845         uint16_t word;
 846         uint32_t dword;
 847         uint64_t qword;
 848     } val;
 849     uint64_t data = 0;
 850
 851     /* Load the ROM lazily when the guest tries to read it */
 852     if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
 853         vfio_pci_load_rom(vdev);
 854     }
 855
 856     memcpy(&val, vdev->rom + addr,
 857            (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
 858
 859     switch (size) {
 860     case 1:
 861         data = val.byte;
 862         break;
 863     case 2:
 864         data = le16_to_cpu(val.word);
 865         break;
 866     case 4:
 867         data = le32_to_cpu(val.dword);
 868         break;
 869     default:
 870         hw_error("vfio: unsupported read size, %d bytes\n", size);
 871         break;
 872     }
 873
 874     trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
 875
 876     return data;
 877 }
 878
 879 static void vfio_rom_write(void *opaque, hwaddr addr,
 880                            uint64_t data, unsigned size)
 881 {
 882 }
 883
 884 static const MemoryRegionOps vfio_rom_ops = {
 885     .read = vfio_rom_read,
 886     .write = vfio_rom_write,
 887     .endianness = DEVICE_LITTLE_ENDIAN,
 888 };
 889
 890 static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
 891 {
 892     uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
 893     off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
 894     DeviceState *dev = DEVICE(vdev);
 895     char name[32];
 896     int fd = vdev->vbasedev.fd;
 897
 898     if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
 899         /* Since pci handles romfile, just print a message and return */
 900         if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
 901             error_printf("Warning : Device at %04x:%02x:%02x.%x "
 902                          "is known to cause system instability issues during "
 903                          "option rom execution. "
 904                          "Proceeding anyway since user specified romfile\n",
 905                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
 906                          vdev->host.function);
 907         }
 908         return;
 909     }
 910
 911     /*
 912      * Use the same size ROM BAR as the physical device.  The contents
 913      * will get filled in later when the guest tries to read it.
 914      */
 915     if (pread(fd, &orig, 4, offset) != 4 ||
 916         pwrite(fd, &size, 4, offset) != 4 ||
 917         pread(fd, &size, 4, offset) != 4 ||
 918         pwrite(fd, &orig, 4, offset) != 4) {
 919         error_report("%s(%04x:%02x:%02x.%x) failed: %m",
 920                      __func__, vdev->host.domain, vdev->host.bus,
 921                      vdev->host.slot, vdev->host.function);
 922         return;
 923     }
 924
 925     size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
 926
 927     if (!size) {
 928         return;
 929     }
 930
 931     if (vfio_blacklist_opt_rom(vdev)) {
 932         if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
 933             error_printf("Warning : Device at %04x:%02x:%02x.%x "
 934                          "is known to cause system instability issues during "
 935                          "option rom execution. "
 936                          "Proceeding anyway since user specified non zero value for "
 937                          "rombar\n",
 938                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
 939                          vdev->host.function);
 940         } else {
 941             error_printf("Warning : Rom loading for device at "
 942                          "%04x:%02x:%02x.%x has been disabled due to "
 943                          "system instability issues. "
 944                          "Specify rombar=1 or romfile to force\n",
 945                          vdev->host.domain, vdev->host.bus, vdev->host.slot,
 946                          vdev->host.function);
 947             return;
 948         }
 949     }
 950
 951     trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
 952
 953     snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
 954              vdev->host.domain, vdev->host.bus, vdev->host.slot,
 955              vdev->host.function);
 956
 957     memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
 958                           &vfio_rom_ops, vdev, name, size);
 959
 960     pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
 961                      PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
 962
 963     vdev->pdev.has_rom = true;
 964     vdev->rom_read_failed = false;
 965 }
 966
 967 void vfio_vga_write(void *opaque, hwaddr addr,
 968                            uint64_t data, unsigned size)
 969 {
 970     VFIOVGARegion *region = opaque;
 971     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
 972     union {
 973         uint8_t byte;
 974         uint16_t word;
 975         uint32_t dword;
 976         uint64_t qword;
 977     } buf;
 978     off_t offset = vga->fd_offset + region->offset + addr;
 979
 980     switch (size) {
 981     case 1:
 982         buf.byte = data;
 983         break;
 984     case 2:
 985         buf.word = cpu_to_le16(data);
 986         break;
 987     case 4:
 988         buf.dword = cpu_to_le32(data);
 989         break;
 990     default:
 991         hw_error("vfio: unsupported write size, %d bytes", size);
 992         break;
 993     }
 994
 995     if (pwrite(vga->fd, &buf, size, offset) != size) {
 996         error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
 997                      __func__, region->offset + addr, data, size);
 998     }
 999
1000     trace_vfio_vga_write(region->offset + addr, data, size);
1001 }
1002
1003 uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1004 {
1005     VFIOVGARegion *region = opaque;
1006     VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1007     union {
1008         uint8_t byte;
1009         uint16_t word;
1010         uint32_t dword;
1011         uint64_t qword;
1012     } buf;
1013     uint64_t data = 0;
1014     off_t offset = vga->fd_offset + region->offset + addr;
1015
1016     if (pread(vga->fd, &buf, size, offset) != size) {
1017         error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1018                      __func__, region->offset + addr, size);
1019         return (uint64_t)-1;
1020     }
1021
1022     switch (size) {
1023     case 1:
1024         data = buf.byte;
1025         break;
1026     case 2:
1027         data = le16_to_cpu(buf.word);
1028         break;
1029     case 4:
1030         data = le32_to_cpu(buf.dword);
1031         break;
1032     default:
1033         hw_error("vfio: unsupported read size, %d bytes", size);
1034         break;
1035     }
1036
1037     trace_vfio_vga_read(region->offset + addr, size, data);
1038
1039     return data;
1040 }
1041
1042 static const MemoryRegionOps vfio_vga_ops = {
1043     .read = vfio_vga_read,
1044     .write = vfio_vga_write,
1045     .endianness = DEVICE_LITTLE_ENDIAN,
1046 };
1047
1048 /*
1049  * PCI config space
1050  */
1051 uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1052 {
1053     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
1054     uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
1055
1056     memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1057     emu_bits = le32_to_cpu(emu_bits);
1058
1059     if (emu_bits) {
1060         emu_val = pci_default_read_config(pdev, addr, len);
1061     }
1062
1063     if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1064         ssize_t ret;
1065
1066         ret = pread(vdev->vbasedev.fd, &phys_val, len,
1067                     vdev->config_offset + addr);
1068         if (ret != len) {
1069             error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m",
1070                          __func__, vdev->host.domain, vdev->host.bus,
1071                          vdev->host.slot, vdev->host.function, addr, len);
1072             return -errno;
1073         }
1074         phys_val = le32_to_cpu(phys_val);
1075     }
1076
1077     val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
1078
1079     trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
1080
1081     return val;
1082 }
1083
1084 void vfio_pci_write_config(PCIDevice *pdev,
1085                            uint32_t addr, uint32_t val, int len)
1086 {
1087     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
1088     uint32_t val_le = cpu_to_le32(val);
1089
1090     trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
1091
1092     /* Write everything to VFIO, let it filter out what we can't write */
1093     if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
1094                 != len) {
1095         error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m",
1096                      __func__, vdev->host.domain, vdev->host.bus,
1097                      vdev->host.slot, vdev->host.function, addr, val, len);
1098     }
1099
1100     /* MSI/MSI-X Enabling/Disabling */
1101     if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
1102         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
1103         int is_enabled, was_enabled = msi_enabled(pdev);
1104
1105         pci_default_write_config(pdev, addr, val, len);
1106
1107         is_enabled = msi_enabled(pdev);
1108
1109         if (!was_enabled) {
1110             if (is_enabled) {
1111                 vfio_msi_enable(vdev);
1112             }
1113         } else {
1114             if (!is_enabled) {
1115                 vfio_msi_disable(vdev);
1116             } else {
1117                 vfio_update_msi(vdev);
1118             }
1119         }
1120     } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
1121         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
1122         int is_enabled, was_enabled = msix_enabled(pdev);
1123
1124         pci_default_write_config(pdev, addr, val, len);
1125
1126         is_enabled = msix_enabled(pdev);
1127
1128         if (!was_enabled && is_enabled) {
1129             vfio_msix_enable(vdev);
1130         } else if (was_enabled && !is_enabled) {
1131             vfio_msix_disable(vdev);
1132         }
1133     } else {
1134         /* Write everything to QEMU to keep emulated bits correct */
1135         pci_default_write_config(pdev, addr, val, len);
1136     }
1137 }
1138
1139 /*
1140  * Interrupt setup
1141  */
1142 static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
1143 {
1144     /*
1145      * More complicated than it looks.  Disabling MSI/X transitions the
1146      * device to INTx mode (if supported).  Therefore we need to first
1147      * disable MSI/X and then cleanup by disabling INTx.
1148      */
1149     if (vdev->interrupt == VFIO_INT_MSIX) {
1150         vfio_msix_disable(vdev);
1151     } else if (vdev->interrupt == VFIO_INT_MSI) {
1152         vfio_msi_disable(vdev);
1153     }
1154
1155     if (vdev->interrupt == VFIO_INT_INTx) {
1156         vfio_intx_disable(vdev);
1157     }
1158 }
1159
1160 static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos)
1161 {
1162     uint16_t ctrl;
1163     bool msi_64bit, msi_maskbit;
1164     int ret, entries;
1165
1166     if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
1167               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
1168         return -errno;
1169     }
1170     ctrl = le16_to_cpu(ctrl);
1171
1172     msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
1173     msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
1174     entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
1175
1176     trace_vfio_msi_setup(vdev->vbasedev.name, pos);
1177
1178     ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
1179     if (ret < 0) {
1180         if (ret == -ENOTSUP) {
1181             return 0;
1182         }
1183         error_report("vfio: msi_init failed");
1184         return ret;
1185     }
1186     vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
1187
1188     return 0;
1189 }
1190
1191 /*
1192  * We don't have any control over how pci_add_capability() inserts
1193  * capabilities into the chain.  In order to setup MSI-X we need a
1194  * MemoryRegion for the BAR.  In order to setup the BAR and not
1195  * attempt to mmap the MSI-X table area, which VFIO won't allow, we
1196  * need to first look for where the MSI-X table lives.  So we
1197  * unfortunately split MSI-X setup across two functions.
1198  */
1199 static int vfio_msix_early_setup(VFIOPCIDevice *vdev)
1200 {
1201     uint8_t pos;
1202     uint16_t ctrl;
1203     uint32_t table, pba;
1204     int fd = vdev->vbasedev.fd;
1205     VFIOMSIXInfo *msix;
1206
1207     pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
1208     if (!pos) {
1209         return 0;
1210     }
1211
1212     if (pread(fd, &ctrl, sizeof(ctrl),
1213               vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
1214         return -errno;
1215     }
1216
1217     if (pread(fd, &table, sizeof(table),
1218               vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
1219         return -errno;
1220     }
1221
1222     if (pread(fd, &pba, sizeof(pba),
1223               vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
1224         return -errno;
1225     }
1226
1227     ctrl = le16_to_cpu(ctrl);
1228     table = le32_to_cpu(table);
1229     pba = le32_to_cpu(pba);
1230
1231     msix = g_malloc0(sizeof(*msix));
1232     msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
1233     msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
1234     msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
1235     msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
1236     msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
1237
1238     /*
1239      * Test the size of the pba_offset variable and catch if it extends outside
1240      * of the specified BAR. If it is the case, we need to apply a hardware
1241      * specific quirk if the device is known or we have a broken configuration.
1242      */
1243     if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
1244         /*
1245          * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
1246          * adapters. The T5 hardware returns an incorrect value of 0x8000 for
1247          * the VF PBA offset while the BAR itself is only 8k. The correct value
1248          * is 0x1000, so we hard code that here.
1249          */
1250         if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
1251             (vdev->device_id & 0xff00) == 0x5800) {
1252             msix->pba_offset = 0x1000;
1253         } else {
1254             error_report("vfio: Hardware reports invalid configuration, "
1255                          "MSIX PBA outside of specified BAR");
1256             g_free(msix);
1257             return -EINVAL;
1258         }
1259     }
1260
1261     trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
1262                                 msix->table_offset, msix->entries);
1263     vdev->msix = msix;
1264
1265     return 0;
1266 }
1267
1268 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos)
1269 {
1270     int ret;
1271
1272     vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
1273                                     sizeof(unsigned long));
1274     ret = msix_init(&vdev->pdev, vdev->msix->entries,
1275                     &vdev->bars[vdev->msix->table_bar].region.mem,
1276                     vdev->msix->table_bar, vdev->msix->table_offset,
1277                     &vdev->bars[vdev->msix->pba_bar].region.mem,
1278                     vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
1279     if (ret < 0) {
1280         if (ret == -ENOTSUP) {
1281             return 0;
1282         }
1283         error_report("vfio: msix_init failed");
1284         return ret;
1285     }
1286
1287     /*
1288      * The PCI spec suggests that devices provide additional alignment for
1289      * MSI-X structures and avoid overlapping non-MSI-X related registers.
1290      * For an assigned device, this hopefully means that emulation of MSI-X
1291      * structures does not affect the performance of the device.  If devices
1292      * fail to provide that alignment, a significant performance penalty may
1293      * result, for instance Mellanox MT27500 VFs:
1294      * http://www.spinics.net/lists/kvm/msg125881.html
1295      *
1296      * The PBA is simply not that important for such a serious regression and
1297      * most drivers do not appear to look at it.  The solution for this is to
1298      * disable the PBA MemoryRegion unless it's being used.  We disable it
1299      * here and only enable it if a masked vector fires through QEMU.  As the
1300      * vector-use notifier is called, which occurs on unmask, we test whether
1301      * PBA emulation is needed and again disable if not.
1302      */
1303     memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
1304
1305     return 0;
1306 }
1307
1308 static void vfio_teardown_msi(VFIOPCIDevice *vdev)
1309 {
1310     msi_uninit(&vdev->pdev);
1311
1312     if (vdev->msix) {
1313         msix_uninit(&vdev->pdev,
1314                     &vdev->bars[vdev->msix->table_bar].region.mem,
1315                     &vdev->bars[vdev->msix->pba_bar].region.mem);
1316         g_free(vdev->msix->pending);
1317     }
1318 }
1319
1320 /*
1321  * Resource setup
1322  */
1323 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
1324 {
1325     int i;
1326
1327     for (i = 0; i < PCI_ROM_SLOT; i++) {
1328         VFIOBAR *bar = &vdev->bars[i];
1329
1330         if (!bar->region.size) {
1331             continue;
1332         }
1333
1334         memory_region_set_enabled(&bar->region.mmap_mem, enabled);
1335         if (vdev->msix && vdev->msix->table_bar == i) {
1336             memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
1337         }
1338     }
1339 }
1340
1341 static void vfio_unregister_bar(VFIOPCIDevice *vdev, int nr)
1342 {
1343     VFIOBAR *bar = &vdev->bars[nr];
1344
1345     if (!bar->region.size) {
1346         return;
1347     }
1348
1349     vfio_bar_quirk_teardown(vdev, nr);
1350
1351     memory_region_del_subregion(&bar->region.mem, &bar->region.mmap_mem);
1352
1353     if (vdev->msix && vdev->msix->table_bar == nr) {
1354         memory_region_del_subregion(&bar->region.mem, &vdev->msix->mmap_mem);
1355     }
1356 }
1357
1358 static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr)
1359 {
1360     VFIOBAR *bar = &vdev->bars[nr];
1361
1362     if (!bar->region.size) {
1363         return;
1364     }
1365
1366     vfio_bar_quirk_free(vdev, nr);
1367
1368     munmap(bar->region.mmap, memory_region_size(&bar->region.mmap_mem));
1369
1370     if (vdev->msix && vdev->msix->table_bar == nr) {
1371         munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
1372     }
1373 }
1374
1375 static void vfio_map_bar(VFIOPCIDevice *vdev, int nr)
1376 {
1377     VFIOBAR *bar = &vdev->bars[nr];
1378     uint64_t size = bar->region.size;
1379     char name[64];
1380     uint32_t pci_bar;
1381     uint8_t type;
1382     int ret;
1383
1384     /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
1385     if (!size) {
1386         return;
1387     }
1388
1389     snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
1390              vdev->host.domain, vdev->host.bus, vdev->host.slot,
1391              vdev->host.function, nr);
1392
1393     /* Determine what type of BAR this is for registration */
1394     ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
1395                 vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
1396     if (ret != sizeof(pci_bar)) {
1397         error_report("vfio: Failed to read BAR %d (%m)", nr);
1398         return;
1399     }
1400
1401     pci_bar = le32_to_cpu(pci_bar);
1402     bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
1403     bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
1404     type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
1405                                     ~PCI_BASE_ADDRESS_MEM_MASK);
1406
1407     /* A "slow" read/write mapping underlies all BARs */
1408     memory_region_init_io(&bar->region.mem, OBJECT(vdev), &vfio_region_ops,
1409                           bar, name, size);
1410     pci_register_bar(&vdev->pdev, nr, type, &bar->region.mem);
1411
1412     /*
1413      * We can't mmap areas overlapping the MSIX vector table, so we
1414      * potentially insert a direct-mapped subregion before and after it.
1415      */
1416     if (vdev->msix && vdev->msix->table_bar == nr) {
1417         size = vdev->msix->table_offset & qemu_real_host_page_mask;
1418     }
1419
1420     strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
1421     if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem,
1422                       &bar->region.mmap_mem, &bar->region.mmap,
1423                       size, 0, name)) {
1424         error_report("%s unsupported. Performance may be slow", name);
1425     }
1426
1427     if (vdev->msix && vdev->msix->table_bar == nr) {
1428         uint64_t start;
1429
1430         start = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
1431                                      (vdev->msix->entries *
1432                                       PCI_MSIX_ENTRY_SIZE));
1433
1434         size = start < bar->region.size ? bar->region.size - start : 0;
1435         strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
1436         /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
1437         if (vfio_mmap_region(OBJECT(vdev), &bar->region, &bar->region.mem,
1438                           &vdev->msix->mmap_mem,
1439                           &vdev->msix->mmap, size, start, name)) {
1440             error_report("%s unsupported. Performance may be slow", name);
1441         }
1442     }
1443
1444     vfio_bar_quirk_setup(vdev, nr);
1445 }
1446
1447 static void vfio_map_bars(VFIOPCIDevice *vdev)
1448 {
1449     int i;
1450
1451     for (i = 0; i < PCI_ROM_SLOT; i++) {
1452         vfio_map_bar(vdev, i);
1453     }
1454
1455     if (vdev->has_vga) {
1456         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
1457                               OBJECT(vdev), &vfio_vga_ops,
1458                               &vdev->vga.region[QEMU_PCI_VGA_MEM],
1459                               "vfio-vga-mmio@0xa0000",
1460                               QEMU_PCI_VGA_MEM_SIZE);
1461         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
1462                               OBJECT(vdev), &vfio_vga_ops,
1463                               &vdev->vga.region[QEMU_PCI_VGA_IO_LO],
1464                               "vfio-vga-io@0x3b0",
1465                               QEMU_PCI_VGA_IO_LO_SIZE);
1466         memory_region_init_io(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem,
1467                               OBJECT(vdev), &vfio_vga_ops,
1468                               &vdev->vga.region[QEMU_PCI_VGA_IO_HI],
1469                               "vfio-vga-io@0x3c0",
1470                               QEMU_PCI_VGA_IO_HI_SIZE);
1471
1472         pci_register_vga(&vdev->pdev, &vdev->vga.region[QEMU_PCI_VGA_MEM].mem,
1473                          &vdev->vga.region[QEMU_PCI_VGA_IO_LO].mem,
1474                          &vdev->vga.region[QEMU_PCI_VGA_IO_HI].mem);
1475         vfio_vga_quirk_setup(vdev);
1476     }
1477 }
1478
1479 static void vfio_unregister_bars(VFIOPCIDevice *vdev)
1480 {
1481     int i;
1482
1483     for (i = 0; i < PCI_ROM_SLOT; i++) {
1484         vfio_unregister_bar(vdev, i);
1485     }
1486
1487     if (vdev->has_vga) {
1488         vfio_vga_quirk_teardown(vdev);
1489         pci_unregister_vga(&vdev->pdev);
1490     }
1491 }
1492
1493 static void vfio_unmap_bars(VFIOPCIDevice *vdev)
1494 {
1495     int i;
1496
1497     for (i = 0; i < PCI_ROM_SLOT; i++) {
1498         vfio_unmap_bar(vdev, i);
1499     }
1500
1501     if (vdev->has_vga) {
1502         vfio_vga_quirk_free(vdev);
1503     }
1504 }
1505
1506 /*
1507  * General setup
1508  */
1509 static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
1510 {
1511     uint8_t tmp, next = 0xff;
1512
1513     for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
1514          tmp = pdev->config[tmp + 1]) {
1515         if (tmp > pos && tmp < next) {
1516             next = tmp;
1517         }
1518     }
1519
1520     return next - pos;
1521 }
1522
/*
 * Update the 16-bit word at @buf: clear the bits selected by @mask, then
 * OR in @val.
 */
static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
{
    uint16_t current = pci_get_word(buf);

    pci_set_word(buf, (current & ~mask) | val);
}
1527
1528 static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
1529                                    uint16_t val, uint16_t mask)
1530 {
1531     vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
1532     vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
1533     vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
1534 }
1535
/*
 * Update the 32-bit value at @buf: clear the bits selected by @mask, then
 * OR in @val.
 */
static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
{
    uint32_t current = pci_get_long(buf);

    pci_set_long(buf, (current & ~mask) | val);
}
1540
1541 static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
1542                                    uint32_t val, uint32_t mask)
1543 {
1544     vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
1545     vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
1546     vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
1547 }
1548
1549 static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
1550 {
1551     uint16_t flags;
1552     uint8_t type;
1553
1554     flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
1555     type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
1556
1557     if (type != PCI_EXP_TYPE_ENDPOINT &&
1558         type != PCI_EXP_TYPE_LEG_END &&
1559         type != PCI_EXP_TYPE_RC_END) {
1560
1561         error_report("vfio: Assignment of PCIe type 0x%x "
1562                      "devices is not currently supported", type);
1563         return -EINVAL;
1564     }
1565
1566     if (!pci_bus_is_express(vdev->pdev.bus)) {
1567         PCIBus *bus = vdev->pdev.bus;
1568         PCIDevice *bridge;
1569
1570         /*
1571          * Traditionally PCI device assignment exposes the PCIe capability
1572          * as-is on non-express buses.  The reason being that some drivers
1573          * simply assume that it's there, for example tg3.  However when
1574          * we're running on a native PCIe machine type, like Q35, we need
1575          * to hide the PCIe capability.  The reason for this is twofold;
1576          * first Windows guests get a Code 10 error when the PCIe capability
1577          * is exposed in this configuration.  Therefore express devices won't
1578          * work at all unless they're attached to express buses in the VM.
1579          * Second, a native PCIe machine introduces the possibility of fine
1580          * granularity IOMMUs supporting both translation and isolation.
1581          * Guest code to discover the IOMMU visibility of a device, such as
1582          * IOMMU grouping code on Linux, is very aware of device types and
1583          * valid transitions between bus types.  An express device on a non-
1584          * express bus is not a valid combination on bare metal systems.
1585          *
1586          * Drivers that require a PCIe capability to make the device
1587          * functional are simply going to need to have their devices placed
1588          * on a PCIe bus in the VM.
1589          */
1590         while (!pci_bus_is_root(bus)) {
1591             bridge = pci_bridge_get_device(bus);
1592             bus = bridge->bus;
1593         }
1594
1595         if (pci_bus_is_express(bus)) {
1596             return 0;
1597         }
1598
1599     } else if (pci_bus_is_root(vdev->pdev.bus)) {
1600         /*
1601          * On a Root Complex bus Endpoints become Root Complex Integrated
1602          * Endpoints, which changes the type and clears the LNK & LNK2 fields.
1603          */
1604         if (type == PCI_EXP_TYPE_ENDPOINT) {
1605             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
1606                                    PCI_EXP_TYPE_RC_END << 4,
1607                                    PCI_EXP_FLAGS_TYPE);
1608
1609             /* Link Capabilities, Status, and Control goes away */
1610             if (size > PCI_EXP_LNKCTL) {
1611                 vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
1612                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
1613                 vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
1614
1615 #ifndef PCI_EXP_LNKCAP2
1616 #define PCI_EXP_LNKCAP2 44
1617 #endif
1618 #ifndef PCI_EXP_LNKSTA2
1619 #define PCI_EXP_LNKSTA2 50
1620 #endif
1621                 /* Link 2 Capabilities, Status, and Control goes away */
1622                 if (size > PCI_EXP_LNKCAP2) {
1623                     vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
1624                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
1625                     vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
1626                 }
1627             }
1628
1629         } else if (type == PCI_EXP_TYPE_LEG_END) {
1630             /*
1631              * Legacy endpoints don't belong on the root complex.  Windows
1632              * seems to be happier with devices if we skip the capability.
1633              */
1634             return 0;
1635         }
1636
1637     } else {
1638         /*
1639          * Convert Root Complex Integrated Endpoints to regular endpoints.
1640          * These devices don't support LNK/LNK2 capabilities, so make them up.
1641          */
1642         if (type == PCI_EXP_TYPE_RC_END) {
1643             vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
1644                                    PCI_EXP_TYPE_ENDPOINT << 4,
1645                                    PCI_EXP_FLAGS_TYPE);
1646             vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
1647                                    PCI_EXP_LNK_MLW_1 | PCI_EXP_LNK_LS_25, ~0);
1648             vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
1649         }
1650
1651         /* Mark the Link Status bits as emulated to allow virtual negotiation */
1652         vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA,
1653                                pci_get_word(vdev->pdev.config + pos +
1654                                             PCI_EXP_LNKSTA),
1655                                PCI_EXP_LNKCAP_MLW | PCI_EXP_LNKCAP_SLS);
1656     }
1657
1658     pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size);
1659     if (pos >= 0) {
1660         vdev->pdev.exp.exp_cap = pos;
1661     }
1662
1663     return pos;
1664 }
1665
1666 static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
1667 {
1668     uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
1669
1670     if (cap & PCI_EXP_DEVCAP_FLR) {
1671         trace_vfio_check_pcie_flr(vdev->vbasedev.name);
1672         vdev->has_flr = true;
1673     }
1674 }
1675
1676 static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
1677 {
1678     uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
1679
1680     if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
1681         trace_vfio_check_pm_reset(vdev->vbasedev.name);
1682         vdev->has_pm_reset = true;
1683     }
1684 }
1685
1686 static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
1687 {
1688     uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
1689
1690     if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
1691         trace_vfio_check_af_flr(vdev->vbasedev.name);
1692         vdev->has_flr = true;
1693     }
1694 }
1695
1696 static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos)
1697 {
1698     PCIDevice *pdev = &vdev->pdev;
1699     uint8_t cap_id, next, size;
1700     int ret;
1701
1702     cap_id = pdev->config[pos];
1703     next = pdev->config[pos + 1];
1704
1705     /*
1706      * If it becomes important to configure capabilities to their actual
1707      * size, use this as the default when it's something we don't recognize.
1708      * Since QEMU doesn't actually handle many of the config accesses,
1709      * exact size doesn't seem worthwhile.
1710      */
1711     size = vfio_std_cap_max_size(pdev, pos);
1712
1713     /*
1714      * pci_add_capability always inserts the new capability at the head
1715      * of the chain.  Therefore to end up with a chain that matches the
1716      * physical device, we insert from the end by making this recursive.
1717      * This is also why we pre-caclulate size above as cached config space
1718      * will be changed as we unwind the stack.
1719      */
1720     if (next) {
1721         ret = vfio_add_std_cap(vdev, next);
1722         if (ret) {
1723             return ret;
1724         }
1725     } else {
1726         /* Begin the rebuild, use QEMU emulated list bits */
1727         pdev->config[PCI_CAPABILITY_LIST] = 0;
1728         vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
1729         vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
1730     }
1731
1732     /* Use emulated next pointer to allow dropping caps */
1733     pci_set_byte(vdev->emulated_config_bits + pos + 1, 0xff);
1734
1735     switch (cap_id) {
1736     case PCI_CAP_ID_MSI:
1737         ret = vfio_msi_setup(vdev, pos);
1738         break;
1739     case PCI_CAP_ID_EXP:
1740         vfio_check_pcie_flr(vdev, pos);
1741         ret = vfio_setup_pcie_cap(vdev, pos, size);
1742         break;
1743     case PCI_CAP_ID_MSIX:
1744         ret = vfio_msix_setup(vdev, pos);
1745         break;
1746     case PCI_CAP_ID_PM:
1747         vfio_check_pm_reset(vdev, pos);
1748         vdev->pm_cap = pos;
1749         ret = pci_add_capability(pdev, cap_id, pos, size);
1750         break;
1751     case PCI_CAP_ID_AF:
1752         vfio_check_af_flr(vdev, pos);
1753         ret = pci_add_capability(pdev, cap_id, pos, size);
1754         break;
1755     default:
1756         ret = pci_add_capability(pdev, cap_id, pos, size);
1757         break;
1758     }
1759
1760     if (ret < 0) {
1761         error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
1762                      "0x%x[0x%x]@0x%x: %d", vdev->host.domain,
1763                      vdev->host.bus, vdev->host.slot, vdev->host.function,
1764                      cap_id, size, pos, ret);
1765         return ret;
1766     }
1767
1768     return 0;
1769 }
1770
1771 static int vfio_add_capabilities(VFIOPCIDevice *vdev)
1772 {
1773     PCIDevice *pdev = &vdev->pdev;
1774
1775     if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
1776         !pdev->config[PCI_CAPABILITY_LIST]) {
1777         return 0; /* Nothing to add */
1778     }
1779
1780     return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
1781 }
1782
1783 static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
1784 {
1785     PCIDevice *pdev = &vdev->pdev;
1786     uint16_t cmd;
1787
1788     vfio_disable_interrupts(vdev);
1789
1790     /* Make sure the device is in D0 */
1791     if (vdev->pm_cap) {
1792         uint16_t pmcsr;
1793         uint8_t state;
1794
1795         pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
1796         state = pmcsr & PCI_PM_CTRL_STATE_MASK;
1797         if (state) {
1798             pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
1799             vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
1800             /* vfio handles the necessary delay here */
1801             pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
1802             state = pmcsr & PCI_PM_CTRL_STATE_MASK;
1803             if (state) {
1804                 error_report("vfio: Unable to power on device, stuck in D%d",
1805                              state);
1806             }
1807         }
1808     }
1809
1810     /*
1811      * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
1812      * Also put INTx Disable in known state.
1813      */
1814     cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
1815     cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
1816              PCI_COMMAND_INTX_DISABLE);
1817     vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
1818 }
1819
1820 static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
1821 {
1822     vfio_intx_enable(vdev);
1823 }
1824
1825 static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
1826                                 PCIHostDeviceAddress *host2)
1827 {
1828     return (host1->domain == host2->domain && host1->bus == host2->bus &&
1829             host1->slot == host2->slot && host1->function == host2->function);
1830 }
1831
1832 static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
1833 {
1834     VFIOGroup *group;
1835     struct vfio_pci_hot_reset_info *info;
1836     struct vfio_pci_dependent_device *devices;
1837     struct vfio_pci_hot_reset *reset;
1838     int32_t *fds;
1839     int ret, i, count;
1840     bool multi = false;
1841
1842     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
1843
1844     vfio_pci_pre_reset(vdev);
1845     vdev->vbasedev.needs_reset = false;
1846
1847     info = g_malloc0(sizeof(*info));
1848     info->argsz = sizeof(*info);
1849
1850     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
1851     if (ret && errno != ENOSPC) {
1852         ret = -errno;
1853         if (!vdev->has_pm_reset) {
1854             error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
1855                          "no available reset mechanism.", vdev->host.domain,
1856                          vdev->host.bus, vdev->host.slot, vdev->host.function);
1857         }
1858         goto out_single;
1859     }
1860
1861     count = info->count;
1862     info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
1863     info->argsz = sizeof(*info) + (count * sizeof(*devices));
1864     devices = &info->devices[0];
1865
1866     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
1867     if (ret) {
1868         ret = -errno;
1869         error_report("vfio: hot reset info failed: %m");
1870         goto out_single;
1871     }
1872
1873     trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
1874
1875     /* Verify that we have all the groups required */
1876     for (i = 0; i < info->count; i++) {
1877         PCIHostDeviceAddress host;
1878         VFIOPCIDevice *tmp;
1879         VFIODevice *vbasedev_iter;
1880
1881         host.domain = devices[i].segment;
1882         host.bus = devices[i].bus;
1883         host.slot = PCI_SLOT(devices[i].devfn);
1884         host.function = PCI_FUNC(devices[i].devfn);
1885
1886         trace_vfio_pci_hot_reset_dep_devices(host.domain,
1887                 host.bus, host.slot, host.function, devices[i].group_id);
1888
1889         if (vfio_pci_host_match(&host, &vdev->host)) {
1890             continue;
1891         }
1892
1893         QLIST_FOREACH(group, &vfio_group_list, next) {
1894             if (group->groupid == devices[i].group_id) {
1895                 break;
1896             }
1897         }
1898
1899         if (!group) {
1900             if (!vdev->has_pm_reset) {
1901                 error_report("vfio: Cannot reset device %s, "
1902                              "depends on group %d which is not owned.",
1903                              vdev->vbasedev.name, devices[i].group_id);
1904             }
1905             ret = -EPERM;
1906             goto out;
1907         }
1908
1909         /* Prep dependent devices for reset and clear our marker. */
1910         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
1911             if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
1912                 continue;
1913             }
1914             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
1915             if (vfio_pci_host_match(&host, &tmp->host)) {
1916                 if (single) {
1917                     ret = -EINVAL;
1918                     goto out_single;
1919                 }
1920                 vfio_pci_pre_reset(tmp);
1921                 tmp->vbasedev.needs_reset = false;
1922                 multi = true;
1923                 break;
1924             }
1925         }
1926     }
1927
1928     if (!single && !multi) {
1929         ret = -EINVAL;
1930         goto out_single;
1931     }
1932
1933     /* Determine how many group fds need to be passed */
1934     count = 0;
1935     QLIST_FOREACH(group, &vfio_group_list, next) {
1936         for (i = 0; i < info->count; i++) {
1937             if (group->groupid == devices[i].group_id) {
1938                 count++;
1939                 break;
1940             }
1941         }
1942     }
1943
1944     reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
1945     reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
1946     fds = &reset->group_fds[0];
1947
1948     /* Fill in group fds */
1949     QLIST_FOREACH(group, &vfio_group_list, next) {
1950         for (i = 0; i < info->count; i++) {
1951             if (group->groupid == devices[i].group_id) {
1952                 fds[reset->count++] = group->fd;
1953                 break;
1954             }
1955         }
1956     }
1957
1958     /* Bus reset! */
1959     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
1960     g_free(reset);
1961
1962     trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
1963                                     ret ? "%m" : "Success");
1964
1965 out:
1966     /* Re-enable INTx on affected devices */
1967     for (i = 0; i < info->count; i++) {
1968         PCIHostDeviceAddress host;
1969         VFIOPCIDevice *tmp;
1970         VFIODevice *vbasedev_iter;
1971
1972         host.domain = devices[i].segment;
1973         host.bus = devices[i].bus;
1974         host.slot = PCI_SLOT(devices[i].devfn);
1975         host.function = PCI_FUNC(devices[i].devfn);
1976
1977         if (vfio_pci_host_match(&host, &vdev->host)) {
1978             continue;
1979         }
1980
1981         QLIST_FOREACH(group, &vfio_group_list, next) {
1982             if (group->groupid == devices[i].group_id) {
1983                 break;
1984             }
1985         }
1986
1987         if (!group) {
1988             break;
1989         }
1990
1991         QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
1992             if (vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
1993                 continue;
1994             }
1995             tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
1996             if (vfio_pci_host_match(&host, &tmp->host)) {
1997                 vfio_pci_post_reset(tmp);
1998                 break;
1999             }
2000         }
2001     }
2002 out_single:
2003     vfio_pci_post_reset(vdev);
2004     g_free(info);
2005
2006     return ret;
2007 }
2008
/*
 * We want to differentiate hot reset of multiple in-use devices vs hot reset
 * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
 * of doing hot resets when there is only a single device per bus.  The in-use
 * here refers to how many VFIODevices are affected.  A hot reset that affects
 * multiple devices, but only a single in-use device, means that we can call
 * it from our bus ->reset() callback since the extent is effectively a single
 * device.  This allows us to make use of it in the hotplug path.  When there
 * are multiple in-use devices, we can only trigger the hot reset during a
 * system reset and thus from our reset handler.  We separate _one vs _multi
 * here so that we don't overlap and do a double reset on the system reset
 * path where both our reset handler and ->reset() callback are used.  Calling
 * _one() will only do a hot reset for the one in-use device case, calling
 * _multi() will do nothing if a _one() would have been sufficient.
 */
2024 static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2025 {
2026     return vfio_pci_hot_reset(vdev, true);
2027 }
2028
2029 static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
2030 {
2031     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2032     return vfio_pci_hot_reset(vdev, false);
2033 }
2034
2035 static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
2036 {
2037     VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2038     if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
2039         vbasedev->needs_reset = true;
2040     }
2041 }
2042
/*
 * PCI implementations of the VFIODevice callbacks; vfio_initfn() installs
 * this table as vbasedev.ops.
 */
static VFIODeviceOps vfio_pci_ops = {
    .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
    .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
    .vfio_eoi = vfio_intx_eoi,
};
2048
2049 static int vfio_populate_device(VFIOPCIDevice *vdev)
2050 {
2051     VFIODevice *vbasedev = &vdev->vbasedev;
2052     struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
2053     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
2054     int i, ret = -1;
2055
2056     /* Sanity check device */
2057     if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
2058         error_report("vfio: Um, this isn't a PCI device");
2059         goto error;
2060     }
2061
2062     if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
2063         error_report("vfio: unexpected number of io regions %u",
2064                      vbasedev->num_regions);
2065         goto error;
2066     }
2067
2068     if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
2069         error_report("vfio: unexpected number of irqs %u", vbasedev->num_irqs);
2070         goto error;
2071     }
2072
2073     for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2074         reg_info.index = i;
2075
2076         ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
2077         if (ret) {
2078             error_report("vfio: Error getting region %d info: %m", i);
2079             goto error;
2080         }
2081
2082         trace_vfio_populate_device_region(vbasedev->name, i,
2083                                           (unsigned long)reg_info.size,
2084                                           (unsigned long)reg_info.offset,
2085                                           (unsigned long)reg_info.flags);
2086
2087         vdev->bars[i].region.vbasedev = vbasedev;
2088         vdev->bars[i].region.flags = reg_info.flags;
2089         vdev->bars[i].region.size = reg_info.size;
2090         vdev->bars[i].region.fd_offset = reg_info.offset;
2091         vdev->bars[i].region.nr = i;
2092         QLIST_INIT(&vdev->bars[i].quirks);
2093     }
2094
2095     reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
2096
2097     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
2098     if (ret) {
2099         error_report("vfio: Error getting config info: %m");
2100         goto error;
2101     }
2102
2103     trace_vfio_populate_device_config(vdev->vbasedev.name,
2104                                       (unsigned long)reg_info.size,
2105                                       (unsigned long)reg_info.offset,
2106                                       (unsigned long)reg_info.flags);
2107
2108     vdev->config_size = reg_info.size;
2109     if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2110         vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2111     }
2112     vdev->config_offset = reg_info.offset;
2113
2114     if ((vdev->features & VFIO_FEATURE_ENABLE_VGA) &&
2115         vbasedev->num_regions > VFIO_PCI_VGA_REGION_INDEX) {
2116         struct vfio_region_info vga_info = {
2117             .argsz = sizeof(vga_info),
2118             .index = VFIO_PCI_VGA_REGION_INDEX,
2119          };
2120
2121         ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, &vga_info);
2122         if (ret) {
2123             error_report(
2124                 "vfio: Device does not support requested feature x-vga");
2125             goto error;
2126         }
2127
2128         if (!(vga_info.flags & VFIO_REGION_INFO_FLAG_READ) ||
2129             !(vga_info.flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2130             vga_info.size < 0xbffff + 1) {
2131             error_report("vfio: Unexpected VGA info, flags 0x%lx, size 0x%lx",
2132                          (unsigned long)vga_info.flags,
2133                          (unsigned long)vga_info.size);
2134             goto error;
2135         }
2136
2137         vdev->vga.fd_offset = vga_info.offset;
2138         vdev->vga.fd = vdev->vbasedev.fd;
2139
2140         vdev->vga.region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2141         vdev->vga.region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2142         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_MEM].quirks);
2143
2144         vdev->vga.region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2145         vdev->vga.region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2146         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_LO].quirks);
2147
2148         vdev->vga.region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2149         vdev->vga.region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2150         QLIST_INIT(&vdev->vga.region[QEMU_PCI_VGA_IO_HI].quirks);
2151
2152         vdev->has_vga = true;
2153     }
2154
2155     irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
2156
2157     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
2158     if (ret) {
2159         /* This can fail for an old kernel or legacy PCI dev */
2160         trace_vfio_populate_device_get_irq_info_failure();
2161         ret = 0;
2162     } else if (irq_info.count == 1) {
2163         vdev->pci_aer = true;
2164     } else {
2165         error_report("vfio: %s "
2166                      "Could not enable error recovery for the device",
2167                      vbasedev->name);
2168     }
2169
2170 error:
2171     return ret;
2172 }
2173
2174 static void vfio_put_device(VFIOPCIDevice *vdev)
2175 {
2176     g_free(vdev->vbasedev.name);
2177     if (vdev->msix) {
2178         object_unparent(OBJECT(&vdev->msix->mmap_mem));
2179         g_free(vdev->msix);
2180         vdev->msix = NULL;
2181     }
2182     vfio_put_base_device(&vdev->vbasedev);
2183 }
2184
2185 static void vfio_err_notifier_handler(void *opaque)
2186 {
2187     VFIOPCIDevice *vdev = opaque;
2188
2189     if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
2190         return;
2191     }
2192
2193     /*
2194      * TBD. Retrieve the error details and decide what action
2195      * needs to be taken. One of the actions could be to pass
2196      * the error to the guest and have the guest driver recover
2197      * from the error. This requires that PCIe capabilities be
2198      * exposed to the guest. For now, we just terminate the
2199      * guest to contain the error.
2200      */
2201
2202     error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected.  "
2203                  "Please collect any data possible and then kill the guest",
2204                  __func__, vdev->host.domain, vdev->host.bus,
2205                  vdev->host.slot, vdev->host.function);
2206
2207     vm_stop(RUN_STATE_INTERNAL_ERROR);
2208 }
2209
2210 /*
2211  * Registers error notifier for devices supporting error recovery.
2212  * If we encounter a failure in this function, we report an error
2213  * and continue after disabling error recovery support for the
2214  * device.
2215  */
2216 static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
2217 {
2218     int ret;
2219     int argsz;
2220     struct vfio_irq_set *irq_set;
2221     int32_t *pfd;
2222
2223     if (!vdev->pci_aer) {
2224         return;
2225     }
2226
2227     if (event_notifier_init(&vdev->err_notifier, 0)) {
2228         error_report("vfio: Unable to init event notifier for error detection");
2229         vdev->pci_aer = false;
2230         return;
2231     }
2232
2233     argsz = sizeof(*irq_set) + sizeof(*pfd);
2234
2235     irq_set = g_malloc0(argsz);
2236     irq_set->argsz = argsz;
2237     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
2238                      VFIO_IRQ_SET_ACTION_TRIGGER;
2239     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
2240     irq_set->start = 0;
2241     irq_set->count = 1;
2242     pfd = (int32_t *)&irq_set->data;
2243
2244     *pfd = event_notifier_get_fd(&vdev->err_notifier);
2245     qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);
2246
2247     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
2248     if (ret) {
2249         error_report("vfio: Failed to set up error notification");
2250         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
2251         event_notifier_cleanup(&vdev->err_notifier);
2252         vdev->pci_aer = false;
2253     }
2254     g_free(irq_set);
2255 }
2256
2257 static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
2258 {
2259     int argsz;
2260     struct vfio_irq_set *irq_set;
2261     int32_t *pfd;
2262     int ret;
2263
2264     if (!vdev->pci_aer) {
2265         return;
2266     }
2267
2268     argsz = sizeof(*irq_set) + sizeof(*pfd);
2269
2270     irq_set = g_malloc0(argsz);
2271     irq_set->argsz = argsz;
2272     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
2273                      VFIO_IRQ_SET_ACTION_TRIGGER;
2274     irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
2275     irq_set->start = 0;
2276     irq_set->count = 1;
2277     pfd = (int32_t *)&irq_set->data;
2278     *pfd = -1;
2279
2280     ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
2281     if (ret) {
2282         error_report("vfio: Failed to de-assign error fd: %m");
2283     }
2284     g_free(irq_set);
2285     qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
2286                         NULL, NULL, vdev);
2287     event_notifier_cleanup(&vdev->err_notifier);
2288 }
2289
2290 static void vfio_req_notifier_handler(void *opaque)
2291 {
2292     VFIOPCIDevice *vdev = opaque;
2293
2294     if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
2295         return;
2296     }
2297
2298     qdev_unplug(&vdev->pdev.qdev, NULL);
2299 }
2300
2301 static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
2302 {
2303     struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
2304                                       .index = VFIO_PCI_REQ_IRQ_INDEX };
2305     int argsz;
2306     struct vfio_irq_set *irq_set;
2307     int32_t *pfd;
2308
2309     if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
2310         return;
2311     }
2312
2313     if (ioctl(vdev->vbasedev.fd,
2314               VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
2315         return;
2316     }
2317
2318     if (event_notifier_init(&vdev->req_notifier, 0)) {
2319         error_report("vfio: Unable to init event notifier for device request");
2320         return;
2321     }
2322
2323     argsz = sizeof(*irq_set) + sizeof(*pfd);
2324
2325     irq_set = g_malloc0(argsz);
2326     irq_set->argsz = argsz;
2327     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
2328                      VFIO_IRQ_SET_ACTION_TRIGGER;
2329     irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
2330     irq_set->start = 0;
2331     irq_set->count = 1;
2332     pfd = (int32_t *)&irq_set->data;
2333
2334     *pfd = event_notifier_get_fd(&vdev->req_notifier);
2335     qemu_set_fd_handler(*pfd, vfio_req_notifier_handler, NULL, vdev);
2336
2337     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
2338         error_report("vfio: Failed to set up device request notification");
2339         qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
2340         event_notifier_cleanup(&vdev->req_notifier);
2341     } else {
2342         vdev->req_enabled = true;
2343     }
2344
2345     g_free(irq_set);
2346 }
2347
2348 static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
2349 {
2350     int argsz;
2351     struct vfio_irq_set *irq_set;
2352     int32_t *pfd;
2353
2354     if (!vdev->req_enabled) {
2355         return;
2356     }
2357
2358     argsz = sizeof(*irq_set) + sizeof(*pfd);
2359
2360     irq_set = g_malloc0(argsz);
2361     irq_set->argsz = argsz;
2362     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
2363                      VFIO_IRQ_SET_ACTION_TRIGGER;
2364     irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
2365     irq_set->start = 0;
2366     irq_set->count = 1;
2367     pfd = (int32_t *)&irq_set->data;
2368     *pfd = -1;
2369
2370     if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
2371         error_report("vfio: Failed to de-assign device request fd: %m");
2372     }
2373     g_free(irq_set);
2374     qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
2375                         NULL, NULL, vdev);
2376     event_notifier_cleanup(&vdev->req_notifier);
2377
2378     vdev->req_enabled = false;
2379 }
2380
2381 static int vfio_initfn(PCIDevice *pdev)
2382 {
2383     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
2384     VFIODevice *vbasedev_iter;
2385     VFIOGroup *group;
2386     char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
2387     ssize_t len;
2388     struct stat st;
2389     int groupid;
2390     int ret;
2391
2392     /* Check that the host device exists */
2393     snprintf(path, sizeof(path),
2394              "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
2395              vdev->host.domain, vdev->host.bus, vdev->host.slot,
2396              vdev->host.function);
2397     if (stat(path, &st) < 0) {
2398         error_report("vfio: error: no such host device: %s", path);
2399         return -errno;
2400     }
2401
2402     vdev->vbasedev.ops = &vfio_pci_ops;
2403
2404     vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
2405     vdev->vbasedev.name = g_strdup_printf("%04x:%02x:%02x.%01x",
2406                                           vdev->host.domain, vdev->host.bus,
2407                                           vdev->host.slot, vdev->host.function);
2408
2409     strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
2410
2411     len = readlink(path, iommu_group_path, sizeof(path));
2412     if (len <= 0 || len >= sizeof(path)) {
2413         error_report("vfio: error no iommu_group for device");
2414         return len < 0 ? -errno : -ENAMETOOLONG;
2415     }
2416
2417     iommu_group_path[len] = 0;
2418     group_name = basename(iommu_group_path);
2419
2420     if (sscanf(group_name, "%d", &groupid) != 1) {
2421         error_report("vfio: error reading %s: %m", path);
2422         return -errno;
2423     }
2424
2425     trace_vfio_initfn(vdev->vbasedev.name, groupid);
2426
2427     group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev));
2428     if (!group) {
2429         error_report("vfio: failed to get group %d", groupid);
2430         return -ENOENT;
2431     }
2432
2433     snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
2434             vdev->host.domain, vdev->host.bus, vdev->host.slot,
2435             vdev->host.function);
2436
2437     QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2438         if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) {
2439             error_report("vfio: error: device %s is already attached", path);
2440             vfio_put_group(group);
2441             return -EBUSY;
2442         }
2443     }
2444
2445     ret = vfio_get_device(group, path, &vdev->vbasedev);
2446     if (ret) {
2447         error_report("vfio: failed to get device %s", path);
2448         vfio_put_group(group);
2449         return ret;
2450     }
2451
2452     ret = vfio_populate_device(vdev);
2453     if (ret) {
2454         return ret;
2455     }
2456
2457     /* Get a copy of config space */
2458     ret = pread(vdev->vbasedev.fd, vdev->pdev.config,
2459                 MIN(pci_config_size(&vdev->pdev), vdev->config_size),
2460                 vdev->config_offset);
2461     if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
2462         ret = ret < 0 ? -errno : -EFAULT;
2463         error_report("vfio: Failed to read device config space");
2464         return ret;
2465     }
2466
2467     /* vfio emulates a lot for us, but some bits need extra love */
2468     vdev->emulated_config_bits = g_malloc0(vdev->config_size);
2469
2470     /* QEMU can choose to expose the ROM or not */
2471     memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
2472
2473     /*
2474      * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
2475      * device ID is managed by the vendor and need only be a 16-bit value.
2476      * Allow any 16-bit value for subsystem so they can be hidden or changed.
2477      */
2478     if (vdev->vendor_id != PCI_ANY_ID) {
2479         if (vdev->vendor_id >= 0xffff) {
2480             error_report("vfio: Invalid PCI vendor ID provided");
2481             return -EINVAL;
2482         }
2483         vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
2484         trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id);
2485     } else {
2486         vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
2487     }
2488
2489     if (vdev->device_id != PCI_ANY_ID) {
2490         if (vdev->device_id > 0xffff) {
2491             error_report("vfio: Invalid PCI device ID provided");
2492             return -EINVAL;
2493         }
2494         vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
2495         trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id);
2496     } else {
2497         vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
2498     }
2499
2500     if (vdev->sub_vendor_id != PCI_ANY_ID) {
2501         if (vdev->sub_vendor_id > 0xffff) {
2502             error_report("vfio: Invalid PCI subsystem vendor ID provided");
2503             return -EINVAL;
2504         }
2505         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
2506                                vdev->sub_vendor_id, ~0);
2507         trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name,
2508                                               vdev->sub_vendor_id);
2509     }
2510
2511     if (vdev->sub_device_id != PCI_ANY_ID) {
2512         if (vdev->sub_device_id > 0xffff) {
2513             error_report("vfio: Invalid PCI subsystem device ID provided");
2514             return -EINVAL;
2515         }
2516         vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
2517         trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name,
2518                                               vdev->sub_device_id);
2519     }
2520
2521     /* QEMU can change multi-function devices to single function, or reverse */
2522     vdev->emulated_config_bits[PCI_HEADER_TYPE] =
2523                                               PCI_HEADER_TYPE_MULTI_FUNCTION;
2524
2525     /* Restore or clear multifunction, this is always controlled by QEMU */
2526     if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
2527         vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
2528     } else {
2529         vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
2530     }
2531
2532     /*
2533      * Clear host resource mapping info.  If we choose not to register a
2534      * BAR, such as might be the case with the option ROM, we can get
2535      * confusing, unwritable, residual addresses from the host here.
2536      */
2537     memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
2538     memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
2539
2540     vfio_pci_size_rom(vdev);
2541
2542     ret = vfio_msix_early_setup(vdev);
2543     if (ret) {
2544         return ret;
2545     }
2546
2547     vfio_map_bars(vdev);
2548
2549     ret = vfio_add_capabilities(vdev);
2550     if (ret) {
2551         goto out_teardown;
2552     }
2553
2554     /* QEMU emulates all of MSI & MSIX */
2555     if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
2556         memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
2557                MSIX_CAP_LENGTH);
2558     }
2559
2560     if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
2561         memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
2562                vdev->msi_cap_size);
2563     }
2564
2565     if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
2566         vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
2567                                                   vfio_intx_mmap_enable, vdev);
2568         pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_intx_update);
2569         ret = vfio_intx_enable(vdev);
2570         if (ret) {
2571             goto out_teardown;
2572         }
2573     }
2574
2575     vfio_register_err_notifier(vdev);
2576     vfio_register_req_notifier(vdev);
2577     vfio_setup_resetfn_quirk(vdev);
2578
2579     return 0;
2580
2581 out_teardown:
2582     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
2583     vfio_teardown_msi(vdev);
2584     vfio_unregister_bars(vdev);
2585     return ret;
2586 }
2587
2588 static void vfio_instance_finalize(Object *obj)
2589 {
2590     PCIDevice *pci_dev = PCI_DEVICE(obj);
2591     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pci_dev);
2592     VFIOGroup *group = vdev->vbasedev.group;
2593
2594     vfio_unmap_bars(vdev);
2595     g_free(vdev->emulated_config_bits);
2596     g_free(vdev->rom);
2597     vfio_put_device(vdev);
2598     vfio_put_group(group);
2599 }
2600
2601 static void vfio_exitfn(PCIDevice *pdev)
2602 {
2603     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
2604
2605     vfio_unregister_req_notifier(vdev);
2606     vfio_unregister_err_notifier(vdev);
2607     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
2608     vfio_disable_interrupts(vdev);
2609     if (vdev->intx.mmap_timer) {
2610         timer_free(vdev->intx.mmap_timer);
2611     }
2612     vfio_teardown_msi(vdev);
2613     vfio_unregister_bars(vdev);
2614 }
2615
2616 static void vfio_pci_reset(DeviceState *dev)
2617 {
2618     PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
2619     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev);
2620
2621     trace_vfio_pci_reset(vdev->vbasedev.name);
2622
2623     vfio_pci_pre_reset(vdev);
2624
2625     if (vdev->resetfn && !vdev->resetfn(vdev)) {
2626         goto post_reset;
2627     }
2628
2629     if (vdev->vbasedev.reset_works &&
2630         (vdev->has_flr || !vdev->has_pm_reset) &&
2631         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
2632         trace_vfio_pci_reset_flr(vdev->vbasedev.name);
2633         goto post_reset;
2634     }
2635
2636     /* See if we can do our own bus reset */
2637     if (!vfio_pci_hot_reset_one(vdev)) {
2638         goto post_reset;
2639     }
2640
2641     /* If nothing else works and the device supports PM reset, use it */
2642     if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
2643         !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
2644         trace_vfio_pci_reset_pm(vdev->vbasedev.name);
2645         goto post_reset;
2646     }
2647
2648 post_reset:
2649     vfio_pci_post_reset(vdev);
2650 }
2651
2652 static void vfio_instance_init(Object *obj)
2653 {
2654     PCIDevice *pci_dev = PCI_DEVICE(obj);
2655     VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, PCI_DEVICE(obj));
2656
2657     device_add_bootindex_property(obj, &vdev->bootindex,
2658                                   "bootindex", NULL,
2659                                   &pci_dev->qdev, NULL);
2660 }
2661
2662 static Property vfio_pci_dev_properties[] = {
2663     DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
2664     DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
2665                        intx.mmap_timeout, 1100),
2666     DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
2667                     VFIO_FEATURE_ENABLE_VGA_BIT, false),
2668     DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
2669                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
2670     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
2671     DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
2672     DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
2673     DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
2674     DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
2675     DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
2676     DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
2677                        sub_vendor_id, PCI_ANY_ID),
2678     DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
2679                        sub_device_id, PCI_ANY_ID),
2680     /*
2681      * TODO - support passed fds... is this necessary?
2682      * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
2683      * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name),
2684      */
2685     DEFINE_PROP_END_OF_LIST(),
2686 };
2687
2688 static const VMStateDescription vfio_pci_vmstate = {
2689     .name = "vfio-pci",
2690     .unmigratable = 1,
2691 };
2692
2693 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
2694 {
2695     DeviceClass *dc = DEVICE_CLASS(klass);
2696     PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
2697
2698     dc->reset = vfio_pci_reset;
2699     dc->props = vfio_pci_dev_properties;
2700     dc->vmsd = &vfio_pci_vmstate;
2701     dc->desc = "VFIO-based PCI device assignment";
2702     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
2703     pdc->init = vfio_initfn;
2704     pdc->exit = vfio_exitfn;
2705     pdc->config_read = vfio_pci_read_config;
2706     pdc->config_write = vfio_pci_write_config;
2707     pdc->is_express = 1; /* We might be */
2708 }
2709
2710 static const TypeInfo vfio_pci_dev_info = {
2711     .name = "vfio-pci",
2712     .parent = TYPE_PCI_DEVICE,
2713     .instance_size = sizeof(VFIOPCIDevice),
2714     .class_init = vfio_pci_dev_class_init,
2715     .instance_init = vfio_instance_init,
2716     .instance_finalize = vfio_instance_finalize,
2717 };
2718
/* Registers the "vfio-pci" type described by vfio_pci_dev_info. */
static void register_vfio_pci_dev_type(void)
{
    type_register_static(&vfio_pci_dev_info);
}

/* Hand the registration function above to the type_init() hook. */
type_init(register_vfio_pci_dev_type)