*/
#include "qemu/osdep.h"
+#include "qemu/units.h"
#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "qapi/error.h"
+#include "qapi/visitor.h"
+#include <sys/ioctl.h>
#include "hw/nvram/fw_cfg.h"
#include "pci.h"
#include "trace.h"
uint32_t offset;
uint8_t bar;
MemoryRegion *mem;
+ uint8_t data[];
} VFIOConfigMirrorQuirk;
static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
.endianness = DEVICE_LITTLE_ENDIAN,
};
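+/*
+ * Allocate a quirk with an empty ioeventfd list and nr_mem zeroed
+ * MemoryRegions, ready for the caller to initialize.
+ */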
+static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
+{
+ VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
+ QLIST_INIT(&quirk->ioeventfds);
+ quirk->mem = g_new0(MemoryRegion, nr_mem);
+ quirk->nr_mem = nr_mem;
+
+ return quirk;
+}
+
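+/*
+ * Tear down one ioeventfd: unlink it from the quirk's list, remove the
+ * KVM eventfd match, release either the vfio kernel handler (fd = -1
+ * disables it) or the userspace fd handler, and free the notifier.
+ */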
+static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
+{
+ QLIST_REMOVE(ioeventfd, next);
+ memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
+ true, ioeventfd->data, &ioeventfd->e);
+
+ if (ioeventfd->vfio) {
+ struct vfio_device_ioeventfd vfio_ioeventfd;
+
+ vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
+ vfio_ioeventfd.flags = ioeventfd->size;
+ vfio_ioeventfd.data = ioeventfd->data;
+ vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
+ ioeventfd->region_addr;
+ vfio_ioeventfd.fd = -1;
+
+ if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
+ error_report("Failed to remove vfio ioeventfd for %s+0x%"
+ HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
+ memory_region_name(ioeventfd->mr), ioeventfd->addr,
+ ioeventfd->size, ioeventfd->data);
+ }
+ } else {
+ qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
+ NULL, NULL, NULL);
+ }
+
+ event_notifier_cleanup(&ioeventfd->e);
+ trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
+ (uint64_t)ioeventfd->addr, ioeventfd->size,
+ ioeventfd->data);
+ g_free(ioeventfd);
+}
+
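+/* Remove only the dynamically added ioeventfds, e.g. when the device resets */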
+static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
+{
+ VFIOIOEventFD *ioeventfd, *tmp;
+
+ QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
+ if (ioeventfd->dynamic) {
+ vfio_ioeventfd_exit(vdev, ioeventfd);
+ }
+ }
+}
+
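+/*
+ * Userspace fallback when the vfio kernel ioeventfd is unavailable: replay
+ * the matched write into the device region from the QEMU main loop.
+ */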
+static void vfio_ioeventfd_handler(void *opaque)
+{
+ VFIOIOEventFD *ioeventfd = opaque;
+
+ if (event_notifier_test_and_clear(&ioeventfd->e)) {
+ vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
+ ioeventfd->data, ioeventfd->size);
+ trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
+ (uint64_t)ioeventfd->addr, ioeventfd->size,
+ ioeventfd->data);
+ }
+}
+
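+/*
+ * Set up an ioeventfd shortcut for a fixed (addr, size, data) write to mr.
+ * KVM signals the eventfd on a matching guest write; the write is then
+ * completed by the vfio kernel driver if VFIO_DEVICE_IOEVENTFD is
+ * supported, otherwise by vfio_ioeventfd_handler() above.  Returns NULL
+ * if ioeventfds are disabled or the notifier cannot be created.
+ */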
+static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
+ MemoryRegion *mr, hwaddr addr,
+ unsigned size, uint64_t data,
+ VFIORegion *region,
+ hwaddr region_addr, bool dynamic)
+{
+ VFIOIOEventFD *ioeventfd;
+
+ if (vdev->no_kvm_ioeventfd) {
+ return NULL;
+ }
+
+ ioeventfd = g_malloc0(sizeof(*ioeventfd));
+
+ if (event_notifier_init(&ioeventfd->e, 0)) {
+ g_free(ioeventfd);
+ return NULL;
+ }
+
+ /*
+ * MemoryRegion and relative offset, plus additional ioeventfd setup
+ * parameters for configuring and later tearing down KVM ioeventfd.
+ */
+ ioeventfd->mr = mr;
+ ioeventfd->addr = addr;
+ ioeventfd->size = size;
+ ioeventfd->data = data;
+ ioeventfd->dynamic = dynamic;
+ /*
+ * VFIORegion and relative offset for implementing the userspace
+ * handler. data & size fields shared for both uses.
+ */
+ ioeventfd->region = region;
+ ioeventfd->region_addr = region_addr;
+
+ if (!vdev->no_vfio_ioeventfd) {
+ struct vfio_device_ioeventfd vfio_ioeventfd;
+
+ vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
+ vfio_ioeventfd.flags = ioeventfd->size;
+ vfio_ioeventfd.data = ioeventfd->data;
+ vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
+ ioeventfd->region_addr;
+ vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);
+
+ ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
+ VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
+ }
+
+ if (!ioeventfd->vfio) {
+ qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
+ vfio_ioeventfd_handler, NULL, ioeventfd);
+ }
+
+ memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
+ true, ioeventfd->data, &ioeventfd->e);
+ trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
+ size, data, ioeventfd->vfio);
+
+ return ioeventfd;
+}
+
static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
{
VFIOQuirk *quirk;
return;
}
- quirk = g_malloc0(sizeof(*quirk));
- quirk->mem = g_new0(MemoryRegion, 1);
- quirk->nr_mem = 1;
+ quirk = vfio_quirk_alloc(1);
memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
"vfio-ati-3c3-quirk", 1);
return;
}
- quirk = g_malloc0(sizeof(*quirk));
- quirk->mem = g_new0(MemoryRegion, 2);
- quirk->nr_mem = 2;
+ quirk = vfio_quirk_alloc(2);
window = quirk->data = g_malloc0(sizeof(*window) +
sizeof(VFIOConfigWindowMatch));
window->vdev = vdev;
return;
}
- quirk = g_malloc0(sizeof(*quirk));
+ quirk = vfio_quirk_alloc(1);
mirror = quirk->data = g_malloc0(sizeof(*mirror));
- mirror->mem = quirk->mem = g_new0(MemoryRegion, 1);
- quirk->nr_mem = 1;
+ mirror->mem = quirk->mem;
mirror->vdev = vdev;
mirror->offset = 0x4000;
mirror->bar = nr;
* note it for future reference.
*/
-#define PCI_VENDOR_ID_NVIDIA 0x10de
-
/*
* Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
VFIOQuirk *quirk;
VFIONvidia3d0Quirk *data;
- if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
+ if (vdev->no_geforce_quirks ||
+ !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
!vdev->bars[1].region.size) {
return;
}
- quirk = g_malloc0(sizeof(*quirk));
+ quirk = vfio_quirk_alloc(2);
quirk->data = data = g_malloc0(sizeof(*data));
- quirk->mem = g_new0(MemoryRegion, 2);
- quirk->nr_mem = 2;
data->vdev = vdev;
memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
VFIONvidiaBAR5Quirk *bar5;
VFIOConfigWindowQuirk *window;
- if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
- !vdev->vga || nr != 5) {
+ if (vdev->no_geforce_quirks ||
+ !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
+ !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
return;
}
- quirk = g_malloc0(sizeof(*quirk));
- quirk->mem = g_new0(MemoryRegion, 4);
- quirk->nr_mem = 4;
+ quirk = vfio_quirk_alloc(4);
bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
(sizeof(VFIOConfigWindowMatch) * 2));
window = &bar5->window;
trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
}
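+/* Tracks repeated identical writes so they can be promoted to an ioeventfd */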
+typedef struct LastDataSet {
+ VFIOQuirk *quirk;
+ hwaddr addr;
+ uint64_t data;
+ unsigned size;
+ int hits;
+ int added;
+} LastDataSet;
+
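+/* Cap on dynamic ioeventfds per quirk, and the hit count that adds one */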
+#define MAX_DYN_IOEVENTFD 10
+#define HITS_FOR_IOEVENTFD 10
+
/*
* Finally, BAR0 itself. We want to redirect any accesses to either
* 0x1800 or 0x88000 through the PCI config space access functions.
VFIOConfigMirrorQuirk *mirror = opaque;
VFIOPCIDevice *vdev = mirror->vdev;
PCIDevice *pdev = &vdev->pdev;
+ LastDataSet *last = (LastDataSet *)&mirror->data;
vfio_generic_quirk_mirror_write(opaque, addr, data, size);
addr + mirror->offset, data, size);
trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
}
+
+ /*
+ * Automatically add an ioeventfd to handle any repeated write with the
+ * same data and size above the standard PCI config space header. This is
+ * primarily expected to accelerate the MSI-ACK behavior, as noted
+ * above. Current hardware/drivers should trigger an ioeventfd at config
+ * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
+ *
+ * The criterion of 10 successive hits is arbitrary but reliably adds the
+ * MSI-ACK region. Note that as some writes are bypassed via the ioeventfd,
+ * the remaining ones have a greater chance of being seen successively.
+ * To avoid the pathological case of burning up all of QEMU's open file
+ * handles, arbitrarily limit this algorithm to adding no more than 10
+ * ioeventfds, print a warning if we would have added an 11th, and then
+ * stop counting.
+ */
+ if (!vdev->no_kvm_ioeventfd &&
+ addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
+ if (addr != last->addr || data != last->data || size != last->size) {
+ last->addr = addr;
+ last->data = data;
+ last->size = size;
+ last->hits = 1;
+ } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
+ if (last->added < MAX_DYN_IOEVENTFD) {
+ VFIOIOEventFD *ioeventfd;
+ ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
+ data, &vdev->bars[mirror->bar].region,
+ mirror->offset + addr, true);
+ if (ioeventfd) {
+ VFIOQuirk *quirk = last->quirk;
+
+ QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
+ last->added++;
+ }
+ } else {
+ last->added++;
+ warn_report("NVIDIA ioeventfd queue full for %s, unable to "
+ "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
+ "size %u", vdev->vbasedev.name, addr, data, size);
+ }
+ }
+ }
}
static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
.endianness = DEVICE_LITTLE_ENDIAN,
};
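+/* Reset callback: clear the write-tracking state and drop dynamic ioeventfds */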
+static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
+{
+ VFIOConfigMirrorQuirk *mirror = quirk->data;
+ LastDataSet *last = (LastDataSet *)&mirror->data;
+
+ last->addr = last->data = last->size = last->hits = last->added = 0;
+
+ vfio_drop_dynamic_eventfds(vdev, quirk);
+}
+
static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
{
VFIOQuirk *quirk;
VFIOConfigMirrorQuirk *mirror;
+ LastDataSet *last;
- if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
+ if (vdev->no_geforce_quirks ||
+ !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
!vfio_is_vga(vdev) || nr != 0) {
return;
}
- quirk = g_malloc0(sizeof(*quirk));
- mirror = quirk->data = g_malloc0(sizeof(*mirror));
- mirror->mem = quirk->mem = g_new0(MemoryRegion, 1);
- quirk->nr_mem = 1;
+ quirk = vfio_quirk_alloc(1);
+ quirk->reset = vfio_nvidia_bar0_quirk_reset;
+ mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
+ mirror->mem = quirk->mem;
mirror->vdev = vdev;
mirror->offset = 0x88000;
mirror->bar = nr;
+ last = (LastDataSet *)&mirror->data;
+ last->quirk = quirk;
memory_region_init_io(mirror->mem, OBJECT(vdev),
&vfio_nvidia_mirror_quirk, mirror,
/* The 0x1800 offset mirror only seems to get used by legacy VGA */
if (vdev->vga) {
- quirk = g_malloc0(sizeof(*quirk));
- mirror = quirk->data = g_malloc0(sizeof(*mirror));
- mirror->mem = quirk->mem = g_new0(MemoryRegion, 1);
- quirk->nr_mem = 1;
+ quirk = vfio_quirk_alloc(1);
+ quirk->reset = vfio_nvidia_bar0_quirk_reset;
+ mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
+ mirror->mem = quirk->mem;
mirror->vdev = vdev;
mirror->offset = 0x1800;
mirror->bar = nr;
+ last = (LastDataSet *)&mirror->data;
+ last->quirk = quirk;
memory_region_init_io(mirror->mem, OBJECT(vdev),
&vfio_nvidia_mirror_quirk, mirror,
return;
}
- quirk = g_malloc0(sizeof(*quirk));
- quirk->mem = g_new0(MemoryRegion, 2);
- quirk->nr_mem = 2;
+ quirk = vfio_quirk_alloc(2);
quirk->data = rtl = g_malloc0(sizeof(*rtl));
rtl->vdev = vdev;
.name = "vfio-pci-igd-lpc-bridge",
.parent = TYPE_PCI_DEVICE,
.class_init = vfio_pci_igd_lpc_bridge_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { },
+ },
};
static void vfio_pci_igd_register_types(void)
ggms = 1 << ggms;
}
- ggms *= 1024 * 1024;
+ ggms *= MiB;
- return (ggms / (4 * 1024)) * (gen < 8 ? 4 : 8);
+ return (ggms / (4 * KiB)) * (gen < 8 ? 4 : 8);
}
/*
uint16_t cmd_orig, cmd;
Error *err = NULL;
- /* This must be an Intel VGA device. */
- if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) ||
- !vfio_is_vga(vdev) || nr != 4) {
- return;
- }
-
- /*
- * IGD is not a standard, they like to change their specs often. We
- * only attempt to support back to SandBridge and we hope that newer
- * devices maintain compatibility with generation 8.
- */
- gen = igd_gen(vdev);
- if (gen != 6 && gen != 8) {
- error_report("IGD device %s is unsupported by IGD quirks, "
- "try SandyBridge or newer", vdev->vbasedev.name);
- return;
- }
-
/*
- * Regardless of running in UPT or legacy mode, the guest graphics
- * driver may attempt to use stolen memory, however only legacy mode
- * has BIOS support for reserving stolen memory in the guest VM.
- * Emulate the GMCH register in all cases and zero out the stolen
- * memory size here. Legacy mode may request allocation and re-write
- * this below.
+ * This must be an Intel VGA device at address 00:02.0 for us to even
+ * consider enabling legacy mode. The vBIOS has dependencies on the
+ * PCI bus address.
*/
- gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4);
- gmch &= ~((gen < 8 ? 0x1f : 0xff) << (gen < 8 ? 3 : 8));
-
- /* GMCH is read-only, emulated */
- pci_set_long(vdev->pdev.config + IGD_GMCH, gmch);
- pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0);
- pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0);
-
- /*
- * This must be at address 00:02.0 for us to even onsider enabling
- * legacy mode. The vBIOS has dependencies on the PCI bus address.
- */
- if (&vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev),
+ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) ||
+ !vfio_is_vga(vdev) || nr != 4 ||
+ &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev),
0, PCI_DEVFN(0x2, 0))) {
return;
}
return;
}
+ /*
+ * IGD is not a standard; they like to change their specs often. We
+ * only attempt to support back to SandyBridge and we hope that newer
+ * devices maintain compatibility with generation 8.
+ */
+ gen = igd_gen(vdev);
+ if (gen != 6 && gen != 8) {
+ error_report("IGD device %s is unsupported in legacy mode, "
+ "try SandyBridge or newer", vdev->vbasedev.name);
+ return;
+ }
+
/*
* Most of what we're doing here is to enable the ROM to run, so if
* there's no ROM, there's no point in setting up this quirk.
goto out;
}
+ gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4);
+
/*
* If IGD VGA Disable is clear (expected) and VGA is not already enabled,
* try to enable it. Probably shouldn't be using legacy mode without VGA,
* but also no point in us enabling VGA if disabled in hardware.
*/
if (!(gmch & 0x2) && !vdev->vga && vfio_populate_vga(vdev, &err)) {
- error_reportf_err(err, ERR_PREFIX, vdev->vbasedev.name);
+ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
error_report("IGD device %s failed to enable VGA access, "
"legacy mode disabled", vdev->vbasedev.name);
goto out;
ret = vfio_pci_igd_opregion_init(vdev, opregion, &err);
if (ret) {
error_append_hint(&err, "IGD legacy mode disabled\n");
- error_reportf_err(err, ERR_PREFIX, vdev->vbasedev.name);
+ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
goto out;
}
/* Setup our quirk to munge GTT addresses to the VM allocated buffer */
- quirk = g_malloc0(sizeof(*quirk));
- quirk->mem = g_new0(MemoryRegion, 2);
- quirk->nr_mem = 2;
+ quirk = vfio_quirk_alloc(2);
igd = quirk->data = g_malloc0(sizeof(*igd));
igd->vdev = vdev;
igd->index = ~0;
igd->bdsm = vfio_pci_read_config(&vdev->pdev, IGD_BDSM, 4);
- igd->bdsm &= ~((1 << 20) - 1); /* 1MB aligned */
+ igd->bdsm &= ~((1 * MiB) - 1); /* 1MB aligned */
memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_igd_index_quirk,
igd, "vfio-igd-index-quirk", 4);
* when IVD (IGD VGA Disable) is clear, but the claim is that it's unused,
* so let's not waste VM memory for it.
*/
+ gmch &= ~((gen < 8 ? 0x1f : 0xff) << (gen < 8 ? 3 : 8));
+
if (vdev->igd_gms) {
if (vdev->igd_gms <= 0x10) {
gms_mb = vdev->igd_gms * 32;
gmch |= vdev->igd_gms << (gen < 8 ? 3 : 8);
- pci_set_long(vdev->pdev.config + IGD_GMCH, gmch);
} else {
error_report("Unsupported IGD GMS value 0x%x", vdev->igd_gms);
vdev->igd_gms = 0;
* config offset 0x5C.
*/
bdsm_size = g_malloc(sizeof(*bdsm_size));
- *bdsm_size = cpu_to_le64((ggms_mb + gms_mb) * 1024 * 1024);
+ *bdsm_size = cpu_to_le64((ggms_mb + gms_mb) * MiB);
fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size",
bdsm_size, sizeof(*bdsm_size));
+ /* GMCH is read-only, emulated */
+ pci_set_long(vdev->pdev.config + IGD_GMCH, gmch);
+ pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0);
+ pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0);
+
/* BDSM is read-write, emulated. The BIOS needs to be able to write it */
pci_set_long(vdev->pdev.config + IGD_BDSM, 0);
pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0);
int i;
QLIST_FOREACH(quirk, &bar->quirks, next) {
+ while (!QLIST_EMPTY(&quirk->ioeventfds)) {
+ vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
+ }
+
for (i = 0; i < quirk->nr_mem; i++) {
memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
}
/*
* Reset quirks
*/
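+/* Expected to be called from the vfio-pci device reset path in pci.c */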
+void vfio_quirk_reset(VFIOPCIDevice *vdev)
+{
+ int i;
+
+ for (i = 0; i < PCI_ROM_SLOT; i++) {
+ VFIOQuirk *quirk;
+ VFIOBAR *bar = &vdev->bars[i];
+
+ QLIST_FOREACH(quirk, &bar->quirks, next) {
+ if (quirk->reset) {
+ quirk->reset(vdev, quirk);
+ }
+ }
+ }
+}
/*
* AMD Radeon PCI config reset, based on Linux:
break;
}
}
+
+/*
+ * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
+ * devices as members of a clique. Devices within the same clique ID
+ * are capable of direct P2P. It's the user's responsibility to ensure
+ * that this is correct. The spec says that this may reside at any unused
+ * config offset, but reserves C8h and recommends hypervisors place it there.
+ * The spec also states that the hypervisor should place this capability
+ * at the end of the capability list, thus next is defined as 0h.
+ *
+ * +----------------+----------------+----------------+----------------+
+ * | sig 7:0 ('P') | vndr len (8h) | next (0h) | cap id (9h) |
+ * +----------------+----------------+----------------+----------------+
+ * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)| sig 23:8 ('P2') |
+ * +---------------------------------+---------------------------------+
+ *
+ * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
+ */
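+/*
+ * qdev property accessors for the 4-bit clique ID.  The property itself
+ * is expected to be registered on vfio-pci elsewhere in this series
+ * (upstream names it x-nv-gpudirect-clique), e.g.:
+ *   -device vfio-pci,host=0000:01:00.0,x-nv-gpudirect-clique=0
+ */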
+static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ DeviceState *dev = DEVICE(obj);
+ Property *prop = opaque;
+ uint8_t *ptr = qdev_get_prop_ptr(dev, prop);
+
+ visit_type_uint8(v, name, ptr, errp);
+}
+
+static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ DeviceState *dev = DEVICE(obj);
+ Property *prop = opaque;
+ uint8_t value, *ptr = qdev_get_prop_ptr(dev, prop);
+ Error *local_err = NULL;
+
+ if (dev->realized) {
+ qdev_prop_set_after_realize(dev, name, errp);
+ return;
+ }
+
+ visit_type_uint8(v, name, &value, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+
+ if (value & ~0xF) {
+ error_setg(errp, "Property %s: valid range 0-15", name);
+ return;
+ }
+
+ *ptr = value;
+}
+
+const PropertyInfo qdev_prop_nv_gpudirect_clique = {
+ .name = "uint4",
+ .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
+ .get = get_nv_gpudirect_clique_id,
+ .set = set_nv_gpudirect_clique_id,
+};
+
+static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
+{
+ PCIDevice *pdev = &vdev->pdev;
+ int ret, pos = 0xC8;
+
+ if (vdev->nv_gpudirect_clique == 0xFF) {
+ return 0;
+ }
+
+ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
+ error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
+ return -EINVAL;
+ }
+
+ if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
+ PCI_BASE_CLASS_DISPLAY) {
+ error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
+ return -EINVAL;
+ }
+
+ ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
+ if (ret < 0) {
+ error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
+ return ret;
+ }
+
+ memset(vdev->emulated_config_bits + pos, 0xFF, 8);
+ pos += PCI_CAP_FLAGS;
+ pci_set_byte(pdev->config + pos++, 8);
+ pci_set_byte(pdev->config + pos++, 'P');
+ pci_set_byte(pdev->config + pos++, '2');
+ pci_set_byte(pdev->config + pos++, 'P');
+ pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
+ pci_set_byte(pdev->config + pos, 0);
+
+ return 0;
+}
+
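+/*
+ * Add all emulated vendor-defined capabilities; currently only the NVIDIA
+ * GPUDirect clique capability.  Extend here as more virtual caps appear.
+ */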
+int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
+{
+ int ret;
+
+ ret = vfio_add_nv_gpudirect_cap(vdev, errp);
+ if (ret) {
+ return ret;
+ }
+
+ return 0;
+}