/* Extra debugging, trap acceleration paths for more logging */
#define VFIO_ALLOW_MMAP 1
#define VFIO_ALLOW_KVM_INTX 1
+#define VFIO_ALLOW_KVM_MSI 1
+#define VFIO_ALLOW_KVM_MSIX 1
struct VFIODevice;
struct VFIOGroup;
+typedef struct VFIOType1 {
+ MemoryListener listener;
+ int error;
+ bool initialized;
+} VFIOType1;
+
typedef struct VFIOContainer {
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
struct {
/* enable abstraction to support various iommu backends */
union {
- MemoryListener listener; /* Used by type1 iommu */
+ VFIOType1 type1;
};
void (*release)(struct VFIOContainer *);
} iommu_data;
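The union keeps backend-private state out of the common container fields, with iommu_data.release selecting the matching teardown. As a hedged illustration (VFIOSpapr and its field are hypothetical, not part of this patch), a second IOMMU backend would slot in beside type1 like this:

/* Hypothetical sketch: a second IOMMU backend extending the union. */
typedef struct VFIOSpapr {
    int tablefd;                          /* imaginary backend-private state */
} VFIOSpapr;

struct VFIOContainerSketch {
    int fd;
    struct {
        union {
            VFIOType1 type1;              /* VFIOType1 as defined above */
            VFIOSpapr spapr;              /* hypothetical future backend */
        };
        void (*release)(struct VFIOContainerSketch *);
    } iommu_data;
};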
off_t config_offset; /* Offset of config space region within device fd */
unsigned int rom_size;
off_t rom_offset; /* Offset of ROM region within device fd */
+ void *rom;
int msi_cap_size;
VFIOMSIVector *msi_vectors;
VFIOMSIXInfo *msix;
bool pci_aer;
bool has_flr;
bool has_pm_reset;
+ bool needs_reset;
+ bool rom_read_failed;
} VFIODevice;
typedef struct VFIOGroup {
static QLIST_HEAD(, VFIOGroup)
group_list = QLIST_HEAD_INITIALIZER(group_list);
+#ifdef CONFIG_KVM
+/*
+ * We have a single VFIO pseudo device per KVM VM. Once created it lives
+ * for the life of the VM. Closing the file descriptor only drops our
+ * reference to it and the device's reference to kvm. Therefore once
+ * initialized, this file descriptor is only released on QEMU exit and
+ * we'll re-use it should another vfio device be attached before then.
+ */
+static int vfio_kvm_device_fd = -1;
+#endif
+
static void vfio_disable_interrupts(VFIODevice *vdev);
static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
'A' + vdev->intx.pin);
vdev->intx.pending = true;
- qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
+ pci_irq_assert(&vdev->pdev);
vfio_mmap_set_enabled(vdev, false);
if (vdev->intx.mmap_timeout) {
timer_mod(vdev->intx.mmap_timer,
vdev->host.bus, vdev->host.slot, vdev->host.function);
vdev->intx.pending = false;
- qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+ pci_irq_deassert(&vdev->pdev);
vfio_unmask_intx(vdev);
}
qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
vfio_mask_intx(vdev);
vdev->intx.pending = false;
- qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+ pci_irq_deassert(&vdev->pdev);
/* Get an eventfd for resample/unmask */
if (event_notifier_init(&vdev->intx.unmask, 0)) {
*/
vfio_mask_intx(vdev);
vdev->intx.pending = false;
- qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+ pci_irq_deassert(&vdev->pdev);
/* Tell KVM to stop listening for an INTx irqfd */
if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
vfio_disable_interrupts(vdev);
vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
+ pci_config_set_interrupt_pin(vdev->pdev.config, pin);
#ifdef CONFIG_KVM
/*
vfio_disable_intx_kvm(vdev);
vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
vdev->intx.pending = false;
- qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+ pci_irq_deassert(&vdev->pdev);
vfio_mmap_set_enabled(vdev, true);
fd = event_notifier_get_fd(&vdev->intx.interrupt);
return;
}
- DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __func__,
+#ifdef DEBUG_VFIO
+ MSIMessage msg;
+
+ if (vdev->interrupt == VFIO_INT_MSIX) {
+ msg = msix_get_message(&vdev->pdev, nr);
+ } else if (vdev->interrupt == VFIO_INT_MSI) {
+ msg = msi_get_message(&vdev->pdev, nr);
+ } else {
+ abort();
+ }
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) vector %d 0x%"PRIx64"/0x%x\n", __func__,
vdev->host.domain, vdev->host.bus, vdev->host.slot,
- vdev->host.function, nr);
+ vdev->host.function, nr, msg.address, msg.data);
+#endif
if (vdev->interrupt == VFIO_INT_MSIX) {
msix_notify(&vdev->pdev, nr);
* Attempt to enable route through KVM irqchip,
* default to userspace handling if unavailable.
*/
- vector->virq = msg ? kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
+ vector->virq = msg && VFIO_ALLOW_KVM_MSIX ?
+ kvm_irqchip_add_msi_route(kvm_state, *msg) : -1;
if (vector->virq < 0 ||
kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
NULL, vector->virq) < 0) {
* Attempt to enable route through KVM irqchip,
* default to userspace handling if unavailable.
*/
- vector->virq = kvm_irqchip_add_msi_route(kvm_state, vector->msg);
+ vector->virq = VFIO_ALLOW_KVM_MSI ?
+ kvm_irqchip_add_msi_route(kvm_state, vector->msg) : -1;
if (vector->virq < 0 ||
kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
NULL, vector->virq) < 0) {
static void vfio_disable_msix(VFIODevice *vdev)
{
+ int i;
+
msix_unset_vector_notifiers(&vdev->pdev);
+ /*
+ * MSI-X will only release vectors if MSI-X is still enabled on the
+ * device, check through the rest and release it ourselves if necessary.
+ */
+ for (i = 0; i < vdev->nr_vectors; i++) {
+ if (vdev->msi_vectors[i].use) {
+ vfio_msix_vector_release(&vdev->pdev, i);
+ }
+ }
+
if (vdev->nr_vectors) {
vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
}
.endianness = DEVICE_LITTLE_ENDIAN,
};
+static void vfio_pci_load_rom(VFIODevice *vdev)
+{
+ struct vfio_region_info reg_info = {
+ .argsz = sizeof(reg_info),
+ .index = VFIO_PCI_ROM_REGION_INDEX
+ };
+ uint64_t size;
+ off_t off = 0;
+ ssize_t bytes;
+
+ if (ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info)) {
+ error_report("vfio: Error getting ROM info: %m");
+ return;
+ }
+
+ DPRINTF("Device %04x:%02x:%02x.%x ROM:\n", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+ DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+ (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+ (unsigned long)reg_info.flags);
+
+ vdev->rom_size = size = reg_info.size;
+ vdev->rom_offset = reg_info.offset;
+
+ if (!vdev->rom_size) {
+ vdev->rom_read_failed = true;
+ error_report("vfio-pci: Cannot read device rom at "
+ "%04x:%02x:%02x.%x\n",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
+ error_printf("Device option ROM contents are probably invalid "
+ "(check dmesg).\nSkip option ROM probe with rombar=0, "
+ "or load from file with romfile=\n");
+ return;
+ }
+
+ vdev->rom = g_malloc(size);
+ memset(vdev->rom, 0xff, size);
+
+ while (size) {
+ bytes = pread(vdev->fd, vdev->rom + off, size, vdev->rom_offset + off);
+ if (bytes == 0) {
+ break;
+ } else if (bytes > 0) {
+ off += bytes;
+ size -= bytes;
+ } else {
+ if (errno == EINTR || errno == EAGAIN) {
+ continue;
+ }
+ error_report("vfio: Error reading device ROM: %m");
+ break;
+ }
+ }
+}
+
+static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
+{
+ VFIODevice *vdev = opaque;
+ uint64_t val = ((uint64_t)1 << (size * 8)) - 1;
+
+ /* Load the ROM lazily when the guest tries to read it */
+ if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
+ vfio_pci_load_rom(vdev);
+ }
+
+ memcpy(&val, vdev->rom + addr,
+ (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
+
+ DPRINTF("%s(%04x:%02x:%02x.%x, 0x%"HWADDR_PRIx", 0x%x) = 0x%"PRIx64"\n",
+ __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function, addr, size, val);
+
+ return val;
+}
+
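The val initializer in vfio_rom_read() produces the conventional all-ones pattern for each access width, so reads past the ROM look like reads from unimplemented space. A standalone sketch of just that arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Same expression as the val initializer in vfio_rom_read() */
    for (unsigned size = 1; size <= 4; size <<= 1) {
        uint64_t val = ((uint64_t)1 << (size * 8)) - 1;
        printf("size %u -> 0x%llx\n", size, (unsigned long long)val);
    }
    return 0; /* prints 0xff, 0xffff, 0xffffffff */
}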
+static void vfio_rom_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+}
+
+static const MemoryRegionOps vfio_rom_ops = {
+ .read = vfio_rom_read,
+ .write = vfio_rom_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void vfio_pci_size_rom(VFIODevice *vdev)
+{
+ uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
+ off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
+ char name[32];
+
+ if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
+ return;
+ }
+
+ /*
+ * Use the same size ROM BAR as the physical device. The contents
+ * will get filled in later when the guest tries to read it.
+ */
+ if (pread(vdev->fd, &orig, 4, offset) != 4 ||
+ pwrite(vdev->fd, &size, 4, offset) != 4 ||
+ pread(vdev->fd, &size, 4, offset) != 4 ||
+ pwrite(vdev->fd, &orig, 4, offset) != 4) {
+ error_report("%s(%04x:%02x:%02x.%x) failed: %m",
+ __func__, vdev->host.domain, vdev->host.bus,
+ vdev->host.slot, vdev->host.function);
+ return;
+ }
+
+ size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
+
+ if (!size) {
+ return;
+ }
+
+ DPRINTF("%04x:%02x:%02x.%x ROM size 0x%x\n", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function, size);
+
+ snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
+
+ memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
+ &vfio_rom_ops, vdev, name, size);
+
+ pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
+ PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
+
+ vdev->pdev.has_rom = true;
+ vdev->rom_read_failed = false;
+}
+
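vfio_pci_size_rom() performs the standard PCI BAR sizing handshake: save the register, write all 1s, read back the value (the device hardwires its size bits to zero), and restore the original. A self-contained sketch of the arithmetic with a made-up read-back value:

#include <stdint.h>
#include <stdio.h>

#define PCI_ROM_ADDRESS_MASK (~0x7ffU) /* address bits of a ROM BAR */

int main(void)
{
    /* Example: after writing all 1s the device reads back 0xfffe0000,
     * i.e. the low 17 address bits are hardwired to zero. */
    uint32_t readback = 0xfffe0000;
    uint32_t size = ~(readback & PCI_ROM_ADDRESS_MASK) + 1;

    printf("ROM size: 0x%x (%u KB)\n", size, size >> 10); /* 0x20000, 128 */
    return 0;
}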
static void vfio_vga_write(void *opaque, hwaddr addr,
uint64_t data, unsigned size)
{
vdev->host.function);
}
+static void vfio_nvidia_88000_quirk_write(void *opaque, hwaddr addr,
+ uint64_t data, unsigned size)
+{
+ VFIOQuirk *quirk = opaque;
+ VFIODevice *vdev = quirk->vdev;
+ PCIDevice *pdev = &vdev->pdev;
+ hwaddr base = quirk->data.address_match & TARGET_PAGE_MASK;
+
+ vfio_generic_quirk_write(opaque, addr, data, size);
+
+ /*
+ * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
+ * MSI capability ID register. Both the ID and next register are
+ * read-only, so we allow writes covering either of those to real hw.
+ * NB - only fixed for the 0x88000 MMIO window.
+ */
+ if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
+ vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
+ vfio_bar_write(&vdev->bars[quirk->data.bar], addr + base, data, size);
+ }
+}
+
+static const MemoryRegionOps vfio_nvidia_88000_quirk = {
+ .read = vfio_generic_quirk_read,
+ .write = vfio_nvidia_88000_quirk_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
/*
* Finally, BAR0 itself. We want to redirect any accesses to either
* 0x1800 or 0x88000 through the PCI config space access functions.
quirk->data.address_mask = PCIE_CONFIG_SPACE_SIZE - 1;
quirk->data.bar = nr;
- memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_generic_quirk,
+ memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_nvidia_88000_quirk,
quirk, "vfio-nvidia-bar0-88000-quirk",
TARGET_PAGE_ALIGN(quirk->data.address_mask + 1));
memory_region_add_subregion_overlap(&vdev->bars[nr].mem,
while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) {
VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks);
memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem);
+ memory_region_destroy(&quirk->mem);
QLIST_REMOVE(quirk, next);
g_free(quirk);
}
while (!QLIST_EMPTY(&bar->quirks)) {
VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
memory_region_del_subregion(&bar->mem, &quirk->mem);
+ memory_region_destroy(&quirk->mem);
QLIST_REMOVE(quirk, next);
g_free(quirk);
}
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
- return !memory_region_is_ram(section->mr);
+ return !memory_region_is_ram(section->mr) ||
+ /*
+ * Sizing an enabled 64-bit BAR can cause spurious mappings to
+ * addresses in the upper part of the 64-bit address space. These
+ * are never accessed by the CPU and beyond the address width of
+ * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
+ */
+ section->offset_within_address_space & (1ULL << 63);
}
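For context on this filter: while the guest sizes an enabled 64-bit BAR it transiently programs the upper dword to all 1s, and the memory API can briefly expose a region at the resulting address. A small sketch of why bit 63 ends up set (illustrative values only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t bar_lo = 0xfe000000; /* programmed lower half of the BAR */
    uint64_t bar_hi = 0xffffffff; /* transient all-1s upper half while sizing */
    uint64_t addr = (bar_hi << 32) | bar_lo;

    /* The same test vfio_listener_skipped_section() applies */
    assert(addr & (1ULL << 63));
    return 0;
}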
static void vfio_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer,
- iommu_data.listener);
+ iommu_data.type1.listener);
hwaddr iova, end;
void *vaddr;
int ret;
if (vfio_listener_skipped_section(section)) {
DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n",
section->offset_within_address_space,
- section->offset_within_address_space + section->size - 1);
+ section->offset_within_address_space +
+ int128_get64(int128_sub(section->size, int128_one())));
return;
}
error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%m)",
container, iova, end - iova, vaddr, ret);
+
+ /*
+ * On the initfn path, store the first error in the container so we
+ * can gracefully fail. Runtime, there's not much we can do other
+ * than throw a hardware error.
+ */
+ if (!container->iommu_data.type1.initialized) {
+ if (!container->iommu_data.type1.error) {
+ container->iommu_data.type1.error = ret;
+ }
+ } else {
+ hw_error("vfio: DMA mapping failed, unable to continue");
+ }
}
}
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer,
- iommu_data.listener);
+ iommu_data.type1.listener);
hwaddr iova, end;
int ret;
if (vfio_listener_skipped_section(section)) {
DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n",
section->offset_within_address_space,
- section->offset_within_address_space + section->size - 1);
+ section->offset_within_address_space +
+ int128_get64(int128_sub(section->size, int128_one())));
return;
}
static void vfio_listener_release(VFIOContainer *container)
{
- memory_listener_unregister(&container->iommu_data.listener);
+ memory_listener_unregister(&container->iommu_data.type1.listener);
}
/*
memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
+ memory_region_destroy(&bar->mmap_mem);
if (vdev->msix && vdev->msix->table_bar == nr) {
memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
+ memory_region_destroy(&vdev->msix->mmap_mem);
}
memory_region_destroy(&bar->mem);
* potentially insert a direct-mapped subregion before and after it.
*/
if (vdev->msix && vdev->msix->table_bar == nr) {
- size = vdev->msix->table_offset & TARGET_PAGE_MASK;
+ size = vdev->msix->table_offset & qemu_host_page_mask;
}
strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
if (vdev->msix && vdev->msix->table_bar == nr) {
unsigned start;
- start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
- (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
+ start = HOST_PAGE_ALIGN(vdev->msix->table_offset +
+ (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
size = start < bar->size ? bar->size - start : 0;
strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
}
-static int vfio_load_rom(VFIODevice *vdev)
+static void vfio_pci_pre_reset(VFIODevice *vdev)
{
- uint64_t size = vdev->rom_size;
- char name[32];
- off_t off = 0, voff = vdev->rom_offset;
- ssize_t bytes;
- void *ptr;
+ PCIDevice *pdev = &vdev->pdev;
+ uint16_t cmd;
- /* If loading ROM from file, pci handles it */
- if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
- return 0;
+ vfio_disable_interrupts(vdev);
+
+ /* Make sure the device is in D0 */
+ if (vdev->pm_cap) {
+ uint16_t pmcsr;
+ uint8_t state;
+
+ pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
+ state = pmcsr & PCI_PM_CTRL_STATE_MASK;
+ if (state) {
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
+ /* vfio handles the necessary delay here */
+ pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
+ state = pmcsr & PCI_PM_CTRL_STATE_MASK;
+ if (state) {
+ error_report("vfio: Unable to power on device, stuck in D%d\n",
+ state);
+ }
+ }
}
- DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
- vdev->host.bus, vdev->host.slot, vdev->host.function);
+ /*
+ * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
+ * Also put INTx Disable in known state.
+ */
+ cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
+ cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
+ PCI_COMMAND_INTX_DISABLE);
+ vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
+}
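The D-state handling in vfio_pci_pre_reset() masks the low two bits of PMCSR; per the PCI PM spec, 0 means D0 and 3 means D3hot, and writing 0 back requests a return to D0. A minimal sketch of the decode:

#include <stdint.h>
#include <stdio.h>

#define PCI_PM_CTRL_STATE_MASK 0x0003 /* low two bits of PMCSR */

int main(void)
{
    uint16_t pmcsr = 0x0003;             /* example: device stuck in D3hot */
    unsigned state = pmcsr & PCI_PM_CTRL_STATE_MASK;

    printf("device is in D%u\n", state); /* prints "device is in D3" */
    return 0;
}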
- snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
- vdev->host.domain, vdev->host.bus, vdev->host.slot,
- vdev->host.function);
- memory_region_init_ram(&vdev->pdev.rom, OBJECT(vdev), name, size);
- ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
- memset(ptr, 0xff, size);
+static void vfio_pci_post_reset(VFIODevice *vdev)
+{
+ vfio_enable_intx(vdev);
+}
- while (size) {
- bytes = pread(vdev->fd, ptr + off, size, voff + off);
- if (bytes == 0) {
- break; /* expect that we could get back less than the ROM BAR */
- } else if (bytes > 0) {
- off += bytes;
- size -= bytes;
- } else {
- if (errno == EINTR || errno == EAGAIN) {
- continue;
+static bool vfio_pci_host_match(PCIHostDeviceAddress *host1,
+ PCIHostDeviceAddress *host2)
+{
+ return (host1->domain == host2->domain && host1->bus == host2->bus &&
+ host1->slot == host2->slot && host1->function == host2->function);
+}
+
+static int vfio_pci_hot_reset(VFIODevice *vdev, bool single)
+{
+ VFIOGroup *group;
+ struct vfio_pci_hot_reset_info *info;
+ struct vfio_pci_dependent_device *devices;
+ struct vfio_pci_hot_reset *reset;
+ int32_t *fds;
+ int ret, i, count;
+ bool multi = false;
+
+ DPRINTF("%s(%04x:%02x:%02x.%x) %s\n", __func__, vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function,
+ single ? "one" : "multi");
+
+ vfio_pci_pre_reset(vdev);
+ vdev->needs_reset = false;
+
+ info = g_malloc0(sizeof(*info));
+ info->argsz = sizeof(*info);
+
+ ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+ if (ret && errno != ENOSPC) {
+ ret = -errno;
+ if (!vdev->has_pm_reset) {
+ error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
+ "no available reset mechanism.", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+ }
+ goto out_single;
+ }
+
+ count = info->count;
+ info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
+ info->argsz = sizeof(*info) + (count * sizeof(*devices));
+ devices = &info->devices[0];
+
+ ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
+ if (ret) {
+ ret = -errno;
+ error_report("vfio: hot reset info failed: %m");
+ goto out_single;
+ }
+
+ DPRINTF("%04x:%02x:%02x.%x: hot reset dependent devices:\n",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
+
+ /* Verify that we have all the groups required */
+ for (i = 0; i < info->count; i++) {
+ PCIHostDeviceAddress host;
+ VFIODevice *tmp;
+
+ host.domain = devices[i].segment;
+ host.bus = devices[i].bus;
+ host.slot = PCI_SLOT(devices[i].devfn);
+ host.function = PCI_FUNC(devices[i].devfn);
+
+ DPRINTF("\t%04x:%02x:%02x.%x group %d\n", host.domain,
+ host.bus, host.slot, host.function, devices[i].group_id);
+
+ if (vfio_pci_host_match(&host, &vdev->host)) {
+ continue;
+ }
+
+ QLIST_FOREACH(group, &group_list, next) {
+ if (group->groupid == devices[i].group_id) {
+ break;
+ }
+ }
+
+ if (!group) {
+ if (!vdev->has_pm_reset) {
+ error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, "
+ "depends on group %d which is not owned.",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function, devices[i].group_id);
+ }
+ ret = -EPERM;
+ goto out;
+ }
+
+ /* Prep dependent devices for reset and clear our marker. */
+ QLIST_FOREACH(tmp, &group->device_list, next) {
+ if (vfio_pci_host_match(&host, &tmp->host)) {
+ if (single) {
+ DPRINTF("vfio: found another in-use device "
+ "%04x:%02x:%02x.%x\n", host.domain, host.bus,
+ host.slot, host.function);
+ ret = -EINVAL;
+ goto out_single;
+ }
+ vfio_pci_pre_reset(tmp);
+ tmp->needs_reset = false;
+ multi = true;
+ break;
}
- error_report("vfio: Error reading device ROM: %m");
- memory_region_destroy(&vdev->pdev.rom);
- return -errno;
}
}
- pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
- vdev->pdev.has_rom = true;
- return 0;
+ if (!single && !multi) {
+ DPRINTF("vfio: No other in-use devices for multi hot reset\n");
+ ret = -EINVAL;
+ goto out_single;
+ }
+
+ /* Determine how many group fds need to be passed */
+ count = 0;
+ QLIST_FOREACH(group, &group_list, next) {
+ for (i = 0; i < info->count; i++) {
+ if (group->groupid == devices[i].group_id) {
+ count++;
+ break;
+ }
+ }
+ }
+
+ reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
+ reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
+ fds = &reset->group_fds[0];
+
+ /* Fill in group fds */
+ QLIST_FOREACH(group, &group_list, next) {
+ for (i = 0; i < info->count; i++) {
+ if (group->groupid == devices[i].group_id) {
+ fds[reset->count++] = group->fd;
+ break;
+ }
+ }
+ }
+
+ /* Bus reset! */
+ ret = ioctl(vdev->fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
+ g_free(reset);
+
+ DPRINTF("%04x:%02x:%02x.%x hot reset: %s\n", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function,
+ ret ? "%m" : "Success");
+
+out:
+ /* Re-enable INTx on affected devices */
+ for (i = 0; i < info->count; i++) {
+ PCIHostDeviceAddress host;
+ VFIODevice *tmp;
+
+ host.domain = devices[i].segment;
+ host.bus = devices[i].bus;
+ host.slot = PCI_SLOT(devices[i].devfn);
+ host.function = PCI_FUNC(devices[i].devfn);
+
+ if (vfio_pci_host_match(&host, &vdev->host)) {
+ continue;
+ }
+
+ QLIST_FOREACH(group, &group_list, next) {
+ if (group->groupid == devices[i].group_id) {
+ break;
+ }
+ }
+
+ if (!group) {
+ break;
+ }
+
+ QLIST_FOREACH(tmp, &group->device_list, next) {
+ if (vfio_pci_host_match(&host, &tmp->host)) {
+ vfio_pci_post_reset(tmp);
+ break;
+ }
+ }
+ }
+out_single:
+ vfio_pci_post_reset(vdev);
+ g_free(info);
+
+ return ret;
+}
+
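VFIO_DEVICE_GET_PCI_HOT_RESET_INFO above follows VFIO's argsz convention: issue the ioctl once with a minimal buffer to learn the entry count, then reallocate and issue it again with argsz covering the flexible array. A hedged, self-contained sketch of the idiom; example_info and do_ioctl are hypothetical stand-ins:

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical stand-in for a VFIO info struct with a flexible array */
struct example_info {
    uint32_t argsz;     /* caller: size of the buffer being passed in */
    uint32_t count;     /* kernel: number of entries that exist */
    uint32_t entries[]; /* filled only when argsz is large enough */
};

/* do_ioctl stands in for ioctl(fd, SOME_GET_INFO, info) */
static struct example_info *fetch_info(int (*do_ioctl)(struct example_info *))
{
    struct example_info *info = calloc(1, sizeof(*info));

    info->argsz = sizeof(*info);
    do_ioctl(info); /* first call: buffer too small, kernel fills count */

    size_t sz = sizeof(*info) + info->count * sizeof(uint32_t);
    info = realloc(info, sz);
    info->argsz = sz;
    do_ioctl(info); /* second call: entries[] now fits and gets filled */

    return info;
}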
+/*
+ * We want to differentiate hot reset of multiple in-use devices vs hot reset
+ * of a single in-use device. VFIO_DEVICE_RESET will already handle the case
+ * of doing hot resets when there is only a single device per bus. The in-use
+ * here refers to how many VFIODevices are affected. A hot reset that affects
+ * multiple devices, but only a single in-use device, means that we can call
+ * it from our bus ->reset() callback since the extent is effectively a single
+ * device. This allows us to make use of it in the hotplug path. When there
+ * are multiple in-use devices, we can only trigger the hot reset during a
+ * system reset and thus from our reset handler. We separate _one vs _multi
+ * here so that we don't overlap and do a double reset on the system reset
+ * path where both our reset handler and ->reset() callback are used. Calling
+ * _one() will only do a hot reset for the single in-use device case, calling
+ * _multi() will do nothing if a _one() would have been sufficient.
+ */
+static int vfio_pci_hot_reset_one(VFIODevice *vdev)
+{
+ return vfio_pci_hot_reset(vdev, true);
+}
+
+static int vfio_pci_hot_reset_multi(VFIODevice *vdev)
+{
+ return vfio_pci_hot_reset(vdev, false);
+}
+
+static void vfio_pci_reset_handler(void *opaque)
+{
+ VFIOGroup *group;
+ VFIODevice *vdev;
+
+ QLIST_FOREACH(group, &group_list, next) {
+ QLIST_FOREACH(vdev, &group->device_list, next) {
+ if (!vdev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
+ vdev->needs_reset = true;
+ }
+ }
+ }
+
+ QLIST_FOREACH(group, &group_list, next) {
+ QLIST_FOREACH(vdev, &group->device_list, next) {
+ if (vdev->needs_reset) {
+ vfio_pci_hot_reset_multi(vdev);
+ }
+ }
+ }
+}
+
+static void vfio_kvm_device_add_group(VFIOGroup *group)
+{
+#ifdef CONFIG_KVM
+ struct kvm_device_attr attr = {
+ .group = KVM_DEV_VFIO_GROUP,
+ .attr = KVM_DEV_VFIO_GROUP_ADD,
+ .addr = (uint64_t)(unsigned long)&group->fd,
+ };
+
+ if (!kvm_enabled()) {
+ return;
+ }
+
+ if (vfio_kvm_device_fd < 0) {
+ struct kvm_create_device cd = {
+ .type = KVM_DEV_TYPE_VFIO,
+ };
+
+ if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
+ DPRINTF("KVM_CREATE_DEVICE: %m\n");
+ return;
+ }
+
+ vfio_kvm_device_fd = cd.fd;
+ }
+
+ if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+ error_report("Failed to add group %d to KVM VFIO device: %m",
+ group->groupid);
+ }
+#endif
+}
+
+static void vfio_kvm_device_del_group(VFIOGroup *group)
+{
+#ifdef CONFIG_KVM
+ struct kvm_device_attr attr = {
+ .group = KVM_DEV_VFIO_GROUP,
+ .attr = KVM_DEV_VFIO_GROUP_DEL,
+ .addr = (uint64_t)(unsigned long)&group->fd,
+ };
+
+ if (vfio_kvm_device_fd < 0) {
+ return;
+ }
+
+ if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
+ error_report("Failed to remove group %d to KVM VFIO device: %m",
+ group->groupid);
+ }
+#endif
}
static int vfio_connect_container(VFIOGroup *group)
return -errno;
}
- container->iommu_data.listener = vfio_memory_listener;
+ container->iommu_data.type1.listener = vfio_memory_listener;
container->iommu_data.release = vfio_listener_release;
- memory_listener_register(&container->iommu_data.listener, &address_space_memory);
+ memory_listener_register(&container->iommu_data.type1.listener,
+ &address_space_memory);
+
+ if (container->iommu_data.type1.error) {
+ ret = container->iommu_data.type1.error;
+ vfio_listener_release(container);
+ g_free(container);
+ close(fd);
+ error_report("vfio: memory listener initialization failed for container\n");
+ return ret;
+ }
+
+ container->iommu_data.type1.initialized = true;
+
} else {
error_report("vfio: No available IOMMU models");
g_free(container);
return NULL;
}
+ if (QLIST_EMPTY(&group_list)) {
+ qemu_register_reset(vfio_pci_reset_handler, NULL);
+ }
+
QLIST_INSERT_HEAD(&group_list, group, next);
+ vfio_kvm_device_add_group(group);
+
return group;
}
return;
}
+ vfio_kvm_device_del_group(group);
vfio_disconnect_container(group);
QLIST_REMOVE(group, next);
DPRINTF("vfio_put_group: close group->fd\n");
close(group->fd);
g_free(group);
+
+ if (QLIST_EMPTY(&group_list)) {
+ qemu_unregister_reset(vfio_pci_reset_handler, NULL);
+ }
}
static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
}
vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
- if (!vdev->reset_works) {
- error_report("Warning, device %s does not support reset", name);
- }
if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
error_report("vfio: unexpected number of io regions %u",
QLIST_INIT(&vdev->bars[i].quirks);
}
- reg_info.index = VFIO_PCI_ROM_REGION_INDEX;
-
- ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
- if (ret) {
- error_report("vfio: Error getting ROM info: %m");
- goto error;
- }
-
- DPRINTF("Device %s ROM:\n", name);
- DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
- (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
- (unsigned long)reg_info.flags);
-
- vdev->rom_size = reg_info.size;
- vdev->rom_offset = reg_info.offset;
-
reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
if (ret) {
/* This can fail for an old kernel or legacy PCI dev */
- DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure ret=%d\n", ret);
+ DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure: %m\n");
ret = 0;
} else if (irq_info.count == 1) {
vdev->pci_aer = true;
} else {
- error_report("vfio: Warning: "
- "Could not enable error recovery for the device\n");
+ error_report("vfio: %04x:%02x:%02x.%x "
+ "Could not enable error recovery for the device",
+ vdev->host.domain, vdev->host.bus, vdev->host.slot,
+ vdev->host.function);
}
error:
* guest to contain the error.
*/
- error_report("%s (%04x:%02x:%02x.%x)"
- "Unrecoverable error detected...\n"
- "Please collect any data possible and then kill the guest",
- __func__, vdev->host.domain, vdev->host.bus,
- vdev->host.slot, vdev->host.function);
+ error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. "
+ "Please collect any data possible and then kill the guest",
+ __func__, vdev->host.domain, vdev->host.bus,
+ vdev->host.slot, vdev->host.function);
vm_stop(RUN_STATE_IO_ERROR);
}
}
if (event_notifier_init(&vdev->err_notifier, 0)) {
- error_report("vfio: Warning: "
- "Unable to init event notifier for error detection\n");
+ error_report("vfio: Unable to init event notifier for error detection");
vdev->pci_aer = false;
return;
}
ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
if (ret) {
- error_report("vfio: Failed to set up error notification\n");
+ error_report("vfio: Failed to set up error notification");
qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
event_notifier_cleanup(&vdev->err_notifier);
vdev->pci_aer = false;
ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
if (ret) {
- error_report("vfio: Failed to de-assign error fd: %d\n", ret);
+ error_report("vfio: Failed to de-assign error fd: %m");
}
g_free(irq_set);
qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
vdev->emulated_config_bits[PCI_HEADER_TYPE] =
PCI_HEADER_TYPE_MULTI_FUNCTION;
+ /* Restore or clear multifunction, this is always controlled by QEMU */
+ if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+ vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
+ } else {
+ vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
+ }
+
/*
* Clear host resource mapping info. If we choose not to register a
* BAR, such as might be the case with the option ROM, we can get
memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
- vfio_load_rom(vdev);
+ vfio_pci_size_rom(vdev);
ret = vfio_early_setup_msix(vdev);
if (ret) {
vfio_teardown_msi(vdev);
vfio_unmap_bars(vdev);
g_free(vdev->emulated_config_bits);
+ g_free(vdev->rom);
vfio_put_device(vdev);
vfio_put_group(group);
}
{
PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
- uint16_t cmd;
DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
vdev->host.bus, vdev->host.slot, vdev->host.function);
- vfio_disable_interrupts(vdev);
-
- /* Make sure the device is in D0 */
- if (vdev->pm_cap) {
- uint16_t pmcsr;
- uint8_t state;
+ vfio_pci_pre_reset(vdev);
- pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
- state = pmcsr & PCI_PM_CTRL_STATE_MASK;
- if (state) {
- pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
- vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
- /* vfio handles the necessary delay here */
- pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
- state = pmcsr & PCI_PM_CTRL_STATE_MASK;
- if (state) {
- error_report("vfio: Unable to power on device, stuck in D%d\n",
- state);
- }
- }
+ if (vdev->reset_works && (vdev->has_flr || !vdev->has_pm_reset) &&
+ !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
+ DPRINTF("%04x:%02x:%02x.%x FLR/VFIO_DEVICE_RESET\n", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+ goto post_reset;
}
- /*
- * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
- * Also put INTx Disable in known state.
- */
- cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
- cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
- PCI_COMMAND_INTX_DISABLE);
- vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
+ /* See if we can do our own bus reset */
+ if (!vfio_pci_hot_reset_one(vdev)) {
+ goto post_reset;
+ }
- if (vdev->reset_works) {
- if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
- error_report("vfio: Error unable to reset physical device "
- "(%04x:%02x:%02x.%x): %m", vdev->host.domain,
- vdev->host.bus, vdev->host.slot, vdev->host.function);
- }
+ /* If nothing else works and the device supports PM reset, use it */
+ if (vdev->reset_works && vdev->has_pm_reset &&
+ !ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
+ DPRINTF("%04x:%02x:%02x.%x PCI PM Reset\n", vdev->host.domain,
+ vdev->host.bus, vdev->host.slot, vdev->host.function);
+ goto post_reset;
}
- vfio_enable_intx(vdev);
+post_reset:
+ vfio_pci_post_reset(vdev);
}
static Property vfio_pci_dev_properties[] = {