1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
8 #define pr_fmt(fmt) "AMD-Vi: " fmt
9 #define dev_fmt(fmt) pr_fmt(fmt)
11 #include <linux/ratelimit.h>
12 #include <linux/pci.h>
13 #include <linux/acpi.h>
14 #include <linux/amba/bus.h>
15 #include <linux/platform_device.h>
16 #include <linux/pci-ats.h>
17 #include <linux/bitmap.h>
18 #include <linux/slab.h>
19 #include <linux/debugfs.h>
20 #include <linux/scatterlist.h>
21 #include <linux/dma-map-ops.h>
22 #include <linux/dma-direct.h>
23 #include <linux/dma-iommu.h>
24 #include <linux/iommu-helper.h>
25 #include <linux/delay.h>
26 #include <linux/amd-iommu.h>
27 #include <linux/notifier.h>
28 #include <linux/export.h>
29 #include <linux/irq.h>
30 #include <linux/msi.h>
31 #include <linux/irqdomain.h>
32 #include <linux/percpu.h>
33 #include <linux/io-pgtable.h>
34 #include <linux/cc_platform.h>
35 #include <asm/irq_remapping.h>
36 #include <asm/io_apic.h>
38 #include <asm/hw_irq.h>
39 #include <asm/proto.h>
40 #include <asm/iommu.h>
44 #include "amd_iommu.h"
45 #include "../irq_remapping.h"
47 #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
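/*
 * CMD_SET_TYPE() places the command opcode into bits 31:28 of the second
 * 32-bit command word; the remaining fields of the 128-bit command are
 * filled in by the individual build_*() helpers below.
 */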
49 #define LOOP_TIMEOUT 100000
51 /* IO virtual address start page frame number */
52 #define IOVA_START_PFN (1)
53 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
55 /* Reserved IOVA ranges */
56 #define MSI_RANGE_START (0xfee00000)
57 #define MSI_RANGE_END (0xfeefffff)
58 #define HT_RANGE_START (0xfd00000000ULL)
59 #define HT_RANGE_END (0xffffffffffULL)
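/*
 * The MSI window and the HyperTransport region above are claimed by the
 * interconnect and cannot be used as DMA addresses, so they are exposed
 * as reserved regions via amd_iommu_get_resv_regions() below and never
 * handed out by the IOVA allocator.
 */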
61 #define DEFAULT_PGTABLE_LEVEL PAGE_MODE_3_LEVEL
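/*
 * A 3-level v1 page table resolves 3 * 9 + 12 = 39 bits of IOVA (512 GiB);
 * the v1 io-pgtable code grows the table to deeper modes on demand when a
 * larger address needs to be mapped.
 */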
63 static DEFINE_SPINLOCK(pd_bitmap_lock);
65 LIST_HEAD(ioapic_map);
67 LIST_HEAD(acpihid_map);
70 * Domain for untranslated devices - only allocated
71 * if iommu=pt passed on kernel cmd line.
73 const struct iommu_ops amd_iommu_ops;
75 static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
76 int amd_iommu_max_glx_val = -1;
79 * general struct to manage commands sent to an IOMMU
85 struct kmem_cache *amd_iommu_irq_cache;
87 static void detach_device(struct device *dev);
89 /****************************************************************************
93 ****************************************************************************/
95 static inline int get_acpihid_device_id(struct device *dev,
96 struct acpihid_map_entry **entry)
98 struct acpi_device *adev = ACPI_COMPANION(dev);
99 struct acpihid_map_entry *p;
104 list_for_each_entry(p, &acpihid_map, list) {
105 if (acpi_dev_hid_uid_match(adev, p->hid,
106 p->uid[0] ? p->uid : NULL)) {
115 static inline int get_device_sbdf_id(struct device *dev)
120 sbdf = get_pci_sbdf_id(to_pci_dev(dev));
122 sbdf = get_acpihid_device_id(dev, NULL);
127 struct dev_table_entry *get_dev_table(struct amd_iommu *iommu)
129 struct dev_table_entry *dev_table;
130 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
132 BUG_ON(pci_seg == NULL);
133 dev_table = pci_seg->dev_table;
134 BUG_ON(dev_table == NULL);
139 static inline u16 get_device_segment(struct device *dev)
143 if (dev_is_pci(dev)) {
144 struct pci_dev *pdev = to_pci_dev(dev);
146 seg = pci_domain_nr(pdev->bus);
148 u32 devid = get_acpihid_device_id(dev, NULL);
150 seg = PCI_SBDF_TO_SEGID(devid);
156 /* Writes the specific IOMMU for a device into the PCI segment rlookup table */
157 void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid)
159 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
161 pci_seg->rlookup_table[devid] = iommu;
164 static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid)
166 struct amd_iommu_pci_seg *pci_seg;
168 for_each_pci_segment(pci_seg) {
169 if (pci_seg->id == seg)
170 return pci_seg->rlookup_table[devid];
175 static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
177 u16 seg = get_device_segment(dev);
178 int devid = get_device_sbdf_id(dev);
182 return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
185 static struct protection_domain *to_pdomain(struct iommu_domain *dom)
187 return container_of(dom, struct protection_domain, domain);
190 static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
192 struct iommu_dev_data *dev_data;
193 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
195 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
199 spin_lock_init(&dev_data->lock);
200 dev_data->devid = devid;
201 ratelimit_default_init(&dev_data->rs);
203 llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list);
207 static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
209 struct iommu_dev_data *dev_data;
210 struct llist_node *node;
211 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
213 if (llist_empty(&pci_seg->dev_data_list))
216 node = pci_seg->dev_data_list.first;
217 llist_for_each_entry(dev_data, node, dev_data_list) {
218 if (dev_data->devid == devid)
225 static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
227 struct amd_iommu *iommu;
228 struct dev_table_entry *dev_table;
229 u16 devid = pci_dev_id(pdev);
234 iommu = rlookup_amd_iommu(&pdev->dev);
238 amd_iommu_set_rlookup_table(iommu, alias);
239 dev_table = get_dev_table(iommu);
240 memcpy(dev_table[alias].data,
241 dev_table[devid].data,
242 sizeof(dev_table[alias].data));
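/*
 * After the copy above, requests arriving with the alias requestor ID hit
 * a device table entry identical to the real device's, so both IDs share
 * the same domain and translation settings.
 */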
247 static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
249 struct pci_dev *pdev;
251 if (!dev_is_pci(dev))
253 pdev = to_pci_dev(dev);
256 * The IVRS alias stored in the alias table may not be
257 * part of the PCI DMA aliases if its bus differs
258 * from that of the original device.
260 clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);
262 pci_for_each_dma_alias(pdev, clone_alias, NULL);
265 static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
267 struct pci_dev *pdev = to_pci_dev(dev);
268 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
271 /* For ACPI HID devices, there are no aliases */
272 if (!dev_is_pci(dev))
276 * Add the IVRS alias to the pci aliases if it is on the same
277 * bus. The IVRS table may know about a quirk that we don't.
279 ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
280 if (ivrs_alias != pci_dev_id(pdev) &&
281 PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
282 pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);
284 clone_aliases(iommu, dev);
287 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
289 struct iommu_dev_data *dev_data;
291 dev_data = search_dev_data(iommu, devid);
293 if (dev_data == NULL) {
294 dev_data = alloc_dev_data(iommu, devid);
298 if (translation_pre_enabled(iommu))
299 dev_data->defer_attach = true;
306 * Find or create an IOMMU group for an acpihid device.
308 static struct iommu_group *acpihid_device_group(struct device *dev)
310 struct acpihid_map_entry *p, *entry = NULL;
313 devid = get_acpihid_device_id(dev, &entry);
315 return ERR_PTR(devid);
317 list_for_each_entry(p, &acpihid_map, list) {
318 if ((devid == p->devid) && p->group)
319 entry->group = p->group;
323 entry->group = generic_device_group(dev);
325 iommu_group_ref_get(entry->group);
330 static bool pci_iommuv2_capable(struct pci_dev *pdev)
332 static const int caps[] = {
334 PCI_EXT_CAP_ID_PASID,
338 if (!pci_ats_supported(pdev))
341 for (i = 0; i < 2; ++i) {
342 pos = pci_find_ext_capability(pdev, caps[i]);
351 * This function checks if the driver got a valid device from the caller to
352 * avoid dereferencing invalid pointers.
354 static bool check_device(struct device *dev)
356 struct amd_iommu_pci_seg *pci_seg;
357 struct amd_iommu *iommu;
363 sbdf = get_device_sbdf_id(dev);
366 devid = PCI_SBDF_TO_DEVID(sbdf);
368 iommu = rlookup_amd_iommu(dev);
372 /* Out of our scope? */
373 pci_seg = iommu->pci_seg;
374 if (devid > pci_seg->last_bdf)
380 static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
382 struct iommu_dev_data *dev_data;
385 if (dev_iommu_priv_get(dev))
388 sbdf = get_device_sbdf_id(dev);
392 devid = PCI_SBDF_TO_DEVID(sbdf);
393 dev_data = find_dev_data(iommu, devid);
398 setup_aliases(iommu, dev);
401 * By default we use passthrough mode for IOMMUv2 capable devices.
402 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to an
403 * invalid address), we ignore the capability for the device so
404 * it'll be forced to go into translation mode.
406 if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
407 dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) {
408 dev_data->iommu_v2 = iommu->is_iommu_v2;
411 dev_iommu_priv_set(dev, dev_data);
416 static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
418 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
419 struct dev_table_entry *dev_table = get_dev_table(iommu);
422 sbdf = get_device_sbdf_id(dev);
426 devid = PCI_SBDF_TO_DEVID(sbdf);
427 pci_seg->rlookup_table[devid] = NULL;
428 memset(&dev_table[devid], 0, sizeof(struct dev_table_entry));
430 setup_aliases(iommu, dev);
433 static void amd_iommu_uninit_device(struct device *dev)
435 struct iommu_dev_data *dev_data;
437 dev_data = dev_iommu_priv_get(dev);
441 if (dev_data->domain)
444 dev_iommu_priv_set(dev, NULL);
447 * We keep dev_data around for unplugged devices and reuse it when the
448 * device is re-plugged - not doing so would introduce a ton of races.
452 /****************************************************************************
454 * Interrupt handling functions
456 ****************************************************************************/
458 static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
461 struct dev_table_entry *dev_table = get_dev_table(iommu);
463 for (i = 0; i < 4; ++i)
464 pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
467 static void dump_command(unsigned long phys_addr)
469 struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
472 for (i = 0; i < 4; ++i)
473 pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
476 static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event)
478 struct iommu_dev_data *dev_data = NULL;
479 int devid, vmg_tag, flags;
480 struct pci_dev *pdev;
483 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
484 vmg_tag = (event[1]) & 0xFFFF;
485 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
486 spa = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
488 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
491 dev_data = dev_iommu_priv_get(&pdev->dev);
494 if (__ratelimit(&dev_data->rs)) {
495 pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
496 vmg_tag, spa, flags);
499 pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
500 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
501 vmg_tag, spa, flags);
508 static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event)
510 struct iommu_dev_data *dev_data = NULL;
511 int devid, flags_rmp, vmg_tag, flags;
512 struct pci_dev *pdev;
515 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
516 flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
517 vmg_tag = (event[1]) & 0xFFFF;
518 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
519 gpa = ((u64)event[3] << 32) | event[2];
521 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
524 dev_data = dev_iommu_priv_get(&pdev->dev);
527 if (__ratelimit(&dev_data->rs)) {
528 pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
529 vmg_tag, gpa, flags_rmp, flags);
532 pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
533 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
534 vmg_tag, gpa, flags_rmp, flags);
541 #define IS_IOMMU_MEM_TRANSACTION(flags) \
542 (((flags) & EVENT_FLAG_I) == 0)
544 #define IS_WRITE_REQUEST(flags) \
545 ((flags) & EVENT_FLAG_RW)
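/*
 * Per the event log format, a clear I flag means the faulting transaction
 * was a memory access (DMA) rather than an interrupt request, and the RW
 * flag distinguishes writes from reads.
 */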
547 static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
548 u16 devid, u16 domain_id,
549 u64 address, int flags)
551 struct iommu_dev_data *dev_data = NULL;
552 struct pci_dev *pdev;
554 pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
557 dev_data = dev_iommu_priv_get(&pdev->dev);
561 * If this is a DMA fault (for which the I(nterrupt)
562 * bit will be unset), allow report_iommu_fault() to
563 * prevent logging it.
565 if (IS_IOMMU_MEM_TRANSACTION(flags)) {
566 if (!report_iommu_fault(&dev_data->domain->domain,
568 IS_WRITE_REQUEST(flags) ?
574 if (__ratelimit(&dev_data->rs)) {
575 pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
576 domain_id, address, flags);
579 pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
580 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
581 domain_id, address, flags);
589 static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
591 struct device *dev = iommu->iommu.dev;
592 int type, devid, flags, tag;
593 volatile u32 *event = __evt;
599 type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
600 devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
601 pasid = (event[0] & EVENT_DOMID_MASK_HI) |
602 (event[1] & EVENT_DOMID_MASK_LO);
603 flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
604 address = (u64)(((u64)event[3]) << 32) | event[2];
607 /* Did we hit the erratum? */
608 if (++count == LOOP_TIMEOUT) {
609 pr_err("No event written to event log\n");
616 if (type == EVENT_TYPE_IO_FAULT) {
617 amd_iommu_report_page_fault(iommu, devid, pasid, address, flags);
622 case EVENT_TYPE_ILL_DEV:
623 dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
624 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
625 pasid, address, flags);
626 dump_dte_entry(iommu, devid);
628 case EVENT_TYPE_DEV_TAB_ERR:
629 dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x "
630 "address=0x%llx flags=0x%04x]\n",
631 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
634 case EVENT_TYPE_PAGE_TAB_ERR:
635 dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
636 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
637 pasid, address, flags);
639 case EVENT_TYPE_ILL_CMD:
640 dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
641 dump_command(address);
643 case EVENT_TYPE_CMD_HARD_ERR:
644 dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
647 case EVENT_TYPE_IOTLB_INV_TO:
648 dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n",
649 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
652 case EVENT_TYPE_INV_DEV_REQ:
653 dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
654 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
655 pasid, address, flags);
657 case EVENT_TYPE_RMP_FAULT:
658 amd_iommu_report_rmp_fault(iommu, event);
660 case EVENT_TYPE_RMP_HW_ERR:
661 amd_iommu_report_rmp_hw_error(iommu, event);
663 case EVENT_TYPE_INV_PPR_REQ:
664 pasid = PPR_PASID(*((u64 *)__evt));
665 tag = event[1] & 0x03FF;
666 dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
667 iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
668 pasid, address, flags, tag);
671 dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
672 event[0], event[1], event[2], event[3]);
675 memset(__evt, 0, 4 * sizeof(u32));
678 static void iommu_poll_events(struct amd_iommu *iommu)
682 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
683 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
685 while (head != tail) {
686 iommu_print_event(iommu, iommu->evt_buf + head);
687 head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
690 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
693 static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
695 struct amd_iommu_fault fault;
697 if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
698 pr_err_ratelimited("Unknown PPR request received\n");
702 fault.address = raw[1];
703 fault.pasid = PPR_PASID(raw[0]);
704 fault.sbdf = PCI_SEG_DEVID_TO_SBDF(iommu->pci_seg->id, PPR_DEVID(raw[0]));
705 fault.tag = PPR_TAG(raw[0]);
706 fault.flags = PPR_FLAGS(raw[0]);
708 atomic_notifier_call_chain(&ppr_notifier, 0, &fault);
711 static void iommu_poll_ppr_log(struct amd_iommu *iommu)
715 if (iommu->ppr_log == NULL)
718 head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
719 tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
721 while (head != tail) {
726 raw = (u64 *)(iommu->ppr_log + head);
729 * Hardware bug: Interrupt may arrive before the entry is
730 * written to memory. If this happens we need to wait for the entry to arrive.
733 for (i = 0; i < LOOP_TIMEOUT; ++i) {
734 if (PPR_REQ_TYPE(raw[0]) != 0)
739 /* Avoid memcpy function-call overhead */
744 * To detect the hardware bug we need to clear the entry
747 raw[0] = raw[1] = 0UL;
749 /* Update head pointer of hardware ring-buffer */
750 head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
751 writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
753 /* Handle PPR entry */
754 iommu_handle_ppr_entry(iommu, entry);
756 /* Refresh ring-buffer information */
757 head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
758 tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
762 #ifdef CONFIG_IRQ_REMAP
763 static int (*iommu_ga_log_notifier)(u32);
765 int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
767 iommu_ga_log_notifier = notifier;
771 EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
773 static void iommu_poll_ga_log(struct amd_iommu *iommu)
775 u32 head, tail, cnt = 0;
777 if (iommu->ga_log == NULL)
780 head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
781 tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
783 while (head != tail) {
787 raw = (u64 *)(iommu->ga_log + head);
790 /* Avoid memcpy function-call overhead */
793 /* Update head pointer of hardware ring-buffer */
794 head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
795 writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
797 /* Handle GA entry */
798 switch (GA_REQ_TYPE(log_entry)) {
800 if (!iommu_ga_log_notifier)
803 pr_debug("%s: devid=%#x, ga_tag=%#x\n",
804 __func__, GA_DEVID(log_entry),
807 if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
808 pr_err("GA log notifier failed.\n");
817 amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
819 if (!irq_remapping_enabled || !dev_is_pci(dev) ||
820 pci_dev_has_special_msi_domain(to_pci_dev(dev)))
823 dev_set_msi_domain(dev, iommu->msi_domain);
826 #else /* CONFIG_IRQ_REMAP */
828 amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
829 #endif /* !CONFIG_IRQ_REMAP */
831 #define AMD_IOMMU_INT_MASK \
832 (MMIO_STATUS_EVT_OVERFLOW_INT_MASK | \
833 MMIO_STATUS_EVT_INT_MASK | \
834 MMIO_STATUS_PPR_INT_MASK | \
835 MMIO_STATUS_GALOG_INT_MASK)
837 irqreturn_t amd_iommu_int_thread(int irq, void *data)
839 struct amd_iommu *iommu = (struct amd_iommu *) data;
840 u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
842 while (status & AMD_IOMMU_INT_MASK) {
843 /* Enable interrupt sources again */
844 writel(AMD_IOMMU_INT_MASK,
845 iommu->mmio_base + MMIO_STATUS_OFFSET);
847 if (status & MMIO_STATUS_EVT_INT_MASK) {
848 pr_devel("Processing IOMMU Event Log\n");
849 iommu_poll_events(iommu);
852 if (status & MMIO_STATUS_PPR_INT_MASK) {
853 pr_devel("Processing IOMMU PPR Log\n");
854 iommu_poll_ppr_log(iommu);
857 #ifdef CONFIG_IRQ_REMAP
858 if (status & MMIO_STATUS_GALOG_INT_MASK) {
859 pr_devel("Processing IOMMU GA Log\n");
860 iommu_poll_ga_log(iommu);
864 if (status & MMIO_STATUS_EVT_OVERFLOW_INT_MASK) {
865 pr_info_ratelimited("IOMMU event log overflow\n");
866 amd_iommu_restart_event_logging(iommu);
870 * Hardware bug: ERBT1312
871 * When re-enabling the interrupt (by writing 1
872 * to clear the bit), the hardware might also try to set
873 * the interrupt bit in the event status register.
874 * In this scenario, the bit will be set again and will suppress
875 * subsequent interrupts.
877 * Workaround: The IOMMU driver should read back the
878 * status register and check if the interrupt bits are cleared.
879 * If not, the driver will need to go through the interrupt handler
880 * again and re-clear the bits.
882 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
887 irqreturn_t amd_iommu_int_handler(int irq, void *data)
889 return IRQ_WAKE_THREAD;
892 /****************************************************************************
894 * IOMMU command queuing functions
896 ****************************************************************************/
898 static int wait_on_sem(struct amd_iommu *iommu, u64 data)
902 while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
907 if (i == LOOP_TIMEOUT) {
908 pr_alert("Completion-Wait loop timed out\n");
915 static void copy_cmd_to_buffer(struct amd_iommu *iommu,
916 struct iommu_cmd *cmd)
921 /* Copy command to buffer */
922 tail = iommu->cmd_buf_tail;
923 target = iommu->cmd_buf + tail;
924 memcpy(target, cmd, sizeof(*cmd));
926 tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
927 iommu->cmd_buf_tail = tail;
929 /* Tell the IOMMU about it */
930 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
933 static void build_completion_wait(struct iommu_cmd *cmd,
934 struct amd_iommu *iommu,
937 u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
939 memset(cmd, 0, sizeof(*cmd));
940 cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
941 cmd->data[1] = upper_32_bits(paddr);
943 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
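/*
 * With the store bit set, the hardware writes the 64-bit completion value
 * to the cmd_sem location once all commands queued before it have been
 * processed; wait_on_sem() polls that location to detect completion.
 */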
946 static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
948 memset(cmd, 0, sizeof(*cmd));
949 cmd->data[0] = devid;
950 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
954 * Builds an invalidation address which is suitable for one page or multiple
955 * pages. Sets the size bit (S) if more than one page is flushed.
957 static inline u64 build_inv_address(u64 address, size_t size)
959 u64 pages, end, msb_diff;
961 pages = iommu_num_pages(address, size, PAGE_SIZE);
964 return address & PAGE_MASK;
966 end = address + size - 1;
969 * msb_diff would hold the index of the most significant bit that
970 * flipped between the start and end.
972 msb_diff = fls64(end ^ address) - 1;
975 * Bits 63:52 are sign extended. If for some reason bit 51 is different
976 * between the start and the end, invalidate everything.
978 if (unlikely(msb_diff > 51)) {
979 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
982 * The msb-bit must be clear on the address. Just set all the lower bits.
985 address |= (1ull << msb_diff) - 1;
988 /* Clear bits 11:0 */
989 address &= PAGE_MASK;
991 /* Set the size bit - we flush more than one 4kb page */
992 return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
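/*
 * Illustrative example: address = 0x101000, size = 0x3000 covers three
 * pages, so end = 0x103fff and msb_diff = fls64(0x101000 ^ 0x103fff) - 1
 * = 13. Setting the low bits and clearing 11:0 yields 0x101000 with the
 * S bit set which, per the AMD IOMMU spec (first zero bit above bit 12 is
 * bit 13), the hardware treats as the naturally aligned 16K region
 * 0x100000 - 0x103fff: a superset of the requested range.
 */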
995 static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
996 size_t size, u16 domid, int pde)
998 u64 inv_address = build_inv_address(address, size);
1000 memset(cmd, 0, sizeof(*cmd));
1001 cmd->data[1] |= domid;
1002 cmd->data[2] = lower_32_bits(inv_address);
1003 cmd->data[3] = upper_32_bits(inv_address);
1004 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1005 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
1006 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1009 static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
1010 u64 address, size_t size)
1012 u64 inv_address = build_inv_address(address, size);
1014 memset(cmd, 0, sizeof(*cmd));
1015 cmd->data[0] = devid;
1016 cmd->data[0] |= (qdep & 0xff) << 24;
1017 cmd->data[1] = devid;
1018 cmd->data[2] = lower_32_bits(inv_address);
1019 cmd->data[3] = upper_32_bits(inv_address);
1020 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1023 static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, u32 pasid,
1024 u64 address, bool size)
1026 memset(cmd, 0, sizeof(*cmd));
1028 address &= ~(0xfffULL);
1030 cmd->data[0] = pasid;
1031 cmd->data[1] = domid;
1032 cmd->data[2] = lower_32_bits(address);
1033 cmd->data[3] = upper_32_bits(address);
1034 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1035 cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1037 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
1038 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1041 static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1042 int qdep, u64 address, bool size)
1044 memset(cmd, 0, sizeof(*cmd));
1046 address &= ~(0xfffULL);
1048 cmd->data[0] = devid;
1049 cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
1050 cmd->data[0] |= (qdep & 0xff) << 24;
1051 cmd->data[1] = devid;
1052 cmd->data[1] |= (pasid & 0xff) << 16;
1053 cmd->data[2] = lower_32_bits(address);
1054 cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1055 cmd->data[3] = upper_32_bits(address);
1057 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
1058 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1061 static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1062 int status, int tag, bool gn)
1064 memset(cmd, 0, sizeof(*cmd));
1066 cmd->data[0] = devid;
1068 cmd->data[1] = pasid;
1069 cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK;
1071 cmd->data[3] = tag & 0x1ff;
1072 cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
1074 CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
1077 static void build_inv_all(struct iommu_cmd *cmd)
1079 memset(cmd, 0, sizeof(*cmd));
1080 CMD_SET_TYPE(cmd, CMD_INV_ALL);
1083 static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
1085 memset(cmd, 0, sizeof(*cmd));
1086 cmd->data[0] = devid;
1087 CMD_SET_TYPE(cmd, CMD_INV_IRT);
1091 * Writes the command to the IOMMU's command buffer and informs the
1092 * hardware about the new command.
1094 static int __iommu_queue_command_sync(struct amd_iommu *iommu,
1095 struct iommu_cmd *cmd,
1098 unsigned int count = 0;
1099 u32 left, next_tail;
1101 next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1103 left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
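/*
 * The command ring holds CMD_BUFFER_SIZE bytes; the modular subtraction
 * gives the space still available between the software tail and the last
 * known hardware head. When space runs low, the driver re-reads the head
 * register (below) and retries for up to LOOP_TIMEOUT iterations before
 * reporting a command buffer timeout.
 */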
1106 /* Skip udelay() the first time around */
1108 if (count == LOOP_TIMEOUT) {
1109 pr_err("Command buffer timeout\n");
1116 /* Update head and recheck remaining space */
1117 iommu->cmd_buf_head = readl(iommu->mmio_base +
1118 MMIO_CMD_HEAD_OFFSET);
1123 copy_cmd_to_buffer(iommu, cmd);
1125 /* Do we need to make sure all commands are processed? */
1126 iommu->need_sync = sync;
1131 static int iommu_queue_command_sync(struct amd_iommu *iommu,
1132 struct iommu_cmd *cmd,
1135 unsigned long flags;
1138 raw_spin_lock_irqsave(&iommu->lock, flags);
1139 ret = __iommu_queue_command_sync(iommu, cmd, sync);
1140 raw_spin_unlock_irqrestore(&iommu->lock, flags);
1145 static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
1147 return iommu_queue_command_sync(iommu, cmd, true);
1151 * This function queues a completion wait command into the command
1152 * buffer of an IOMMU
1154 static int iommu_completion_wait(struct amd_iommu *iommu)
1156 struct iommu_cmd cmd;
1157 unsigned long flags;
1161 if (!iommu->need_sync)
1164 raw_spin_lock_irqsave(&iommu->lock, flags);
1166 data = ++iommu->cmd_sem_val;
1167 build_completion_wait(&cmd, iommu, data);
1169 ret = __iommu_queue_command_sync(iommu, &cmd, false);
1173 ret = wait_on_sem(iommu, data);
1176 raw_spin_unlock_irqrestore(&iommu->lock, flags);
1181 static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
1183 struct iommu_cmd cmd;
1185 build_inv_dte(&cmd, devid);
1187 return iommu_queue_command(iommu, &cmd);
1190 static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
1193 u16 last_bdf = iommu->pci_seg->last_bdf;
1195 for (devid = 0; devid <= last_bdf; ++devid)
1196 iommu_flush_dte(iommu, devid);
1198 iommu_completion_wait(iommu);
1202 * This function uses heavy locking and may disable irqs for some time. But
1203 * this is no issue because it is only called during resume.
1205 static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
1208 u16 last_bdf = iommu->pci_seg->last_bdf;
1210 for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
1211 struct iommu_cmd cmd;
1212 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1214 iommu_queue_command(iommu, &cmd);
1217 iommu_completion_wait(iommu);
1220 static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
1222 struct iommu_cmd cmd;
1224 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1226 iommu_queue_command(iommu, &cmd);
1228 iommu_completion_wait(iommu);
1231 static void amd_iommu_flush_all(struct amd_iommu *iommu)
1233 struct iommu_cmd cmd;
1235 build_inv_all(&cmd);
1237 iommu_queue_command(iommu, &cmd);
1238 iommu_completion_wait(iommu);
1241 static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
1243 struct iommu_cmd cmd;
1245 build_inv_irt(&cmd, devid);
1247 iommu_queue_command(iommu, &cmd);
1250 static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
1253 u16 last_bdf = iommu->pci_seg->last_bdf;
1255 for (devid = 0; devid <= last_bdf; devid++)
1256 iommu_flush_irt(iommu, devid);
1258 iommu_completion_wait(iommu);
1261 void iommu_flush_all_caches(struct amd_iommu *iommu)
1263 if (iommu_feature(iommu, FEATURE_IA)) {
1264 amd_iommu_flush_all(iommu);
1266 amd_iommu_flush_dte_all(iommu);
1267 amd_iommu_flush_irt_all(iommu);
1268 amd_iommu_flush_tlb_all(iommu);
1273 * Command send function for flushing on-device TLB
1275 static int device_flush_iotlb(struct iommu_dev_data *dev_data,
1276 u64 address, size_t size)
1278 struct amd_iommu *iommu;
1279 struct iommu_cmd cmd;
1282 qdep = dev_data->ats.qdep;
1283 iommu = rlookup_amd_iommu(dev_data->dev);
1287 build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address, size);
1289 return iommu_queue_command(iommu, &cmd);
1292 static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
1294 struct amd_iommu *iommu = data;
1296 return iommu_flush_dte(iommu, alias);
1300 * Command send function for invalidating a device table entry
1302 static int device_flush_dte(struct iommu_dev_data *dev_data)
1304 struct amd_iommu *iommu;
1305 struct pci_dev *pdev = NULL;
1306 struct amd_iommu_pci_seg *pci_seg;
1310 iommu = rlookup_amd_iommu(dev_data->dev);
1314 if (dev_is_pci(dev_data->dev))
1315 pdev = to_pci_dev(dev_data->dev);
1318 ret = pci_for_each_dma_alias(pdev,
1319 device_flush_dte_alias, iommu);
1321 ret = iommu_flush_dte(iommu, dev_data->devid);
1325 pci_seg = iommu->pci_seg;
1326 alias = pci_seg->alias_table[dev_data->devid];
1327 if (alias != dev_data->devid) {
1328 ret = iommu_flush_dte(iommu, alias);
1333 if (dev_data->ats.enabled)
1334 ret = device_flush_iotlb(dev_data, 0, ~0UL);
1340 * TLB invalidation function which is called from the mapping functions.
1341 * It invalidates a single PTE if the range to flush is within a single
1342 * page. Otherwise it flushes the whole TLB of the IOMMU.
1344 static void __domain_flush_pages(struct protection_domain *domain,
1345 u64 address, size_t size, int pde)
1347 struct iommu_dev_data *dev_data;
1348 struct iommu_cmd cmd;
1351 build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
1353 for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1354 if (!domain->dev_iommu[i])
1358 * Devices of this domain are behind this IOMMU
1359 * We need a TLB flush
1361 ret |= iommu_queue_command(amd_iommus[i], &cmd);
1364 list_for_each_entry(dev_data, &domain->dev_list, list) {
1366 if (!dev_data->ats.enabled)
1369 ret |= device_flush_iotlb(dev_data, address, size);
1375 static void domain_flush_pages(struct protection_domain *domain,
1376 u64 address, size_t size, int pde)
1378 if (likely(!amd_iommu_np_cache)) {
1379 __domain_flush_pages(domain, address, size, pde);
1384 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
1385 * In such setups it is best to avoid flushes of ranges which are not
1386 * naturally aligned, since it would lead to flushes of unmodified
1387 * PTEs. Such flushes would require the hypervisor to do more work than
1388 * necessary. Therefore, perform repeated flushes of aligned ranges
1389 * until you cover the range. Each iteration flushes the smaller
1390 * between the natural alignment of the address that we flush and the
1391 * greatest naturally aligned region that fits in the range.
1394 int addr_alignment = __ffs(address);
1395 int size_alignment = __fls(size);
1400 * size is always non-zero, but address might be zero, causing
1401 * addr_alignment to be negative. As the casting of the
1402 * argument in __ffs(address) to long might trim the high bits
1403 * of the address on x86-32, cast to long when doing the check.
1405 if (likely((unsigned long)address != 0))
1406 min_alignment = min(addr_alignment, size_alignment);
1408 min_alignment = size_alignment;
1410 flush_size = 1ul << min_alignment;
1412 __domain_flush_pages(domain, address, flush_size, pde);
1413 address += flush_size;
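/*
 * Example of the splitting above: address = 0x1000, size = 0x7000 is
 * flushed as 4K at 0x1000, then 8K at 0x2000, then 16K at 0x4000 - each
 * chunk naturally aligned, together covering exactly the requested range.
 */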
1418 /* Flush the whole IO/TLB for a given protection domain - including PDE */
1419 void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain)
1421 domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
1424 void amd_iommu_domain_flush_complete(struct protection_domain *domain)
1428 for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1429 if (domain && !domain->dev_iommu[i])
1433 * Devices of this domain are behind this IOMMU
1434 * We need to wait for completion of all commands.
1436 iommu_completion_wait(amd_iommus[i]);
1440 /* Flush the not present cache if it exists */
1441 static void domain_flush_np_cache(struct protection_domain *domain,
1442 dma_addr_t iova, size_t size)
1444 if (unlikely(amd_iommu_np_cache)) {
1445 unsigned long flags;
1447 spin_lock_irqsave(&domain->lock, flags);
1448 domain_flush_pages(domain, iova, size, 1);
1449 amd_iommu_domain_flush_complete(domain);
1450 spin_unlock_irqrestore(&domain->lock, flags);
1456 * This function flushes the DTEs for all devices in the domain
1458 static void domain_flush_devices(struct protection_domain *domain)
1460 struct iommu_dev_data *dev_data;
1462 list_for_each_entry(dev_data, &domain->dev_list, list)
1463 device_flush_dte(dev_data);
1466 /****************************************************************************
1468 * The next functions belong to the domain allocation. A domain is
1469 * allocated for every IOMMU as the default domain. If device isolation
1470 * is enabled, every device gets its own domain. The most important thing
1471 * about domains is the page table mapping the DMA address space they handle.
1474 ****************************************************************************/
1476 static u16 domain_id_alloc(void)
1480 spin_lock(&pd_bitmap_lock);
1481 id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1483 if (id > 0 && id < MAX_DOMAIN_ID)
1484 __set_bit(id, amd_iommu_pd_alloc_bitmap);
1487 spin_unlock(&pd_bitmap_lock);
1492 static void domain_id_free(int id)
1494 spin_lock(&pd_bitmap_lock);
1495 if (id > 0 && id < MAX_DOMAIN_ID)
1496 __clear_bit(id, amd_iommu_pd_alloc_bitmap);
1497 spin_unlock(&pd_bitmap_lock);
1500 static void free_gcr3_tbl_level1(u64 *tbl)
1505 for (i = 0; i < 512; ++i) {
1506 if (!(tbl[i] & GCR3_VALID))
1509 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1511 free_page((unsigned long)ptr);
1515 static void free_gcr3_tbl_level2(u64 *tbl)
1520 for (i = 0; i < 512; ++i) {
1521 if (!(tbl[i] & GCR3_VALID))
1524 ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1526 free_gcr3_tbl_level1(ptr);
1530 static void free_gcr3_table(struct protection_domain *domain)
1532 if (domain->glx == 2)
1533 free_gcr3_tbl_level2(domain->gcr3_tbl);
1534 else if (domain->glx == 1)
1535 free_gcr3_tbl_level1(domain->gcr3_tbl);
1537 BUG_ON(domain->glx != 0);
1539 free_page((unsigned long)domain->gcr3_tbl);
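/*
 * The GCR3 table is a radix tree indexed by PASID, 9 bits per level:
 * glx 0 is a single page covering 512 PASIDs, glx 1 adds a second level
 * for up to 2^18 entries, and glx 2 a third for up to 2^27.
 */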
1542 static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
1543 struct protection_domain *domain, bool ats, bool ppr)
1548 struct dev_table_entry *dev_table = get_dev_table(iommu);
1550 if (domain->iop.mode != PAGE_MODE_NONE)
1551 pte_root = iommu_virt_to_phys(domain->iop.root);
1553 pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
1554 << DEV_ENTRY_MODE_SHIFT;
1556 pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
1559 * When SNP is enabled, only set the TV bit when IOMMU
1560 * page translation is in use.
1562 if (!amd_iommu_snp_en || (domain->id != 0))
1563 pte_root |= DTE_FLAG_TV;
1565 flags = dev_table[devid].data[1];
1568 flags |= DTE_FLAG_IOTLB;
1571 if (iommu_feature(iommu, FEATURE_EPHSUP))
1572 pte_root |= 1ULL << DEV_ENTRY_PPR;
1575 if (domain->flags & PD_IOMMUV2_MASK) {
1576 u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
1577 u64 glx = domain->glx;
1580 pte_root |= DTE_FLAG_GV;
1581 pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
1583 /* First mask out possible old values for GCR3 table */
1584 tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
1587 tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
1590 /* Encode GCR3 table into DTE */
1591 tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
1594 tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
1597 tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
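/*
 * The DTE has no contiguous field large enough for the GCR3 table
 * pointer, so its bits are scattered across the A, B and C fields
 * written above.
 */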
1601 flags &= ~DEV_DOMID_MASK;
1602 flags |= domain->id;
1604 old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
1605 dev_table[devid].data[1] = flags;
1606 dev_table[devid].data[0] = pte_root;
1609 * A kdump kernel might be replacing a domain ID that was copied from
1610 * the previous kernel--if so, it needs to flush the translation cache
1611 * entries for the old domain ID that is being overwritten
1614 amd_iommu_flush_tlb_domid(iommu, old_domid);
1618 static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
1620 struct dev_table_entry *dev_table = get_dev_table(iommu);
1622 /* remove entry from the device table seen by the hardware */
1623 dev_table[devid].data[0] = DTE_FLAG_V;
1625 if (!amd_iommu_snp_en)
1626 dev_table[devid].data[0] |= DTE_FLAG_TV;
1628 dev_table[devid].data[1] &= DTE_FLAG_MASK;
1630 amd_iommu_apply_erratum_63(iommu, devid);
1633 static void do_attach(struct iommu_dev_data *dev_data,
1634 struct protection_domain *domain)
1636 struct amd_iommu *iommu;
1639 iommu = rlookup_amd_iommu(dev_data->dev);
1642 ats = dev_data->ats.enabled;
1644 /* Update data structures */
1645 dev_data->domain = domain;
1646 list_add(&dev_data->list, &domain->dev_list);
1648 /* Do reference counting */
1649 domain->dev_iommu[iommu->index] += 1;
1650 domain->dev_cnt += 1;
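/*
 * The per-IOMMU counter lets the flush helpers skip IOMMUs that have no
 * devices in this domain, see __domain_flush_pages() above.
 */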
1652 /* Update device table */
1653 set_dte_entry(iommu, dev_data->devid, domain,
1654 ats, dev_data->iommu_v2);
1655 clone_aliases(iommu, dev_data->dev);
1657 device_flush_dte(dev_data);
1660 static void do_detach(struct iommu_dev_data *dev_data)
1662 struct protection_domain *domain = dev_data->domain;
1663 struct amd_iommu *iommu;
1665 iommu = rlookup_amd_iommu(dev_data->dev);
1669 /* Update data structures */
1670 dev_data->domain = NULL;
1671 list_del(&dev_data->list);
1672 clear_dte_entry(iommu, dev_data->devid);
1673 clone_aliases(iommu, dev_data->dev);
1675 /* Flush the DTE entry */
1676 device_flush_dte(dev_data);
1679 amd_iommu_domain_flush_tlb_pde(domain);
1681 /* Wait for the flushes to finish */
1682 amd_iommu_domain_flush_complete(domain);
1684 /* decrease reference counters - needs to happen after the flushes */
1685 domain->dev_iommu[iommu->index] -= 1;
1686 domain->dev_cnt -= 1;
1689 static void pdev_iommuv2_disable(struct pci_dev *pdev)
1691 pci_disable_ats(pdev);
1692 pci_disable_pri(pdev);
1693 pci_disable_pasid(pdev);
1696 static int pdev_iommuv2_enable(struct pci_dev *pdev)
1700 /* Only allow access to user-accessible pages */
1701 ret = pci_enable_pasid(pdev, 0);
1705 /* First reset the PRI state of the device */
1706 ret = pci_reset_pri(pdev);
1711 /* FIXME: Hardcode number of outstanding requests for now */
1712 ret = pci_enable_pri(pdev, 32);
1716 ret = pci_enable_ats(pdev, PAGE_SHIFT);
1723 pci_disable_pri(pdev);
1724 pci_disable_pasid(pdev);
1730 * If a device is not yet associated with a domain, this function makes the
1731 * device visible in the domain
1733 static int attach_device(struct device *dev,
1734 struct protection_domain *domain)
1736 struct iommu_dev_data *dev_data;
1737 struct pci_dev *pdev;
1738 unsigned long flags;
1741 spin_lock_irqsave(&domain->lock, flags);
1743 dev_data = dev_iommu_priv_get(dev);
1745 spin_lock(&dev_data->lock);
1748 if (dev_data->domain != NULL)
1751 if (!dev_is_pci(dev))
1752 goto skip_ats_check;
1754 pdev = to_pci_dev(dev);
1755 if (domain->flags & PD_IOMMUV2_MASK) {
1756 struct iommu_domain *def_domain = iommu_get_dma_domain(dev);
1759 if (def_domain->type != IOMMU_DOMAIN_IDENTITY)
1762 if (dev_data->iommu_v2) {
1763 if (pdev_iommuv2_enable(pdev) != 0)
1766 dev_data->ats.enabled = true;
1767 dev_data->ats.qdep = pci_ats_queue_depth(pdev);
1768 dev_data->pri_tlp = pci_prg_resp_pasid_required(pdev);
1770 } else if (amd_iommu_iotlb_sup &&
1771 pci_enable_ats(pdev, PAGE_SHIFT) == 0) {
1772 dev_data->ats.enabled = true;
1773 dev_data->ats.qdep = pci_ats_queue_depth(pdev);
1779 do_attach(dev_data, domain);
1782 * We might boot into a crash-kernel here. The crashed kernel
1783 * left the caches in the IOMMU dirty. So we have to flush
1784 * here to evict all dirty stuff.
1786 amd_iommu_domain_flush_tlb_pde(domain);
1788 amd_iommu_domain_flush_complete(domain);
1791 spin_unlock(&dev_data->lock);
1793 spin_unlock_irqrestore(&domain->lock, flags);
1799 * Removes a device from a protection domain (with devtable_lock held)
1801 static void detach_device(struct device *dev)
1803 struct protection_domain *domain;
1804 struct iommu_dev_data *dev_data;
1805 unsigned long flags;
1807 dev_data = dev_iommu_priv_get(dev);
1808 domain = dev_data->domain;
1810 spin_lock_irqsave(&domain->lock, flags);
1812 spin_lock(&dev_data->lock);
1815 * First check if the device is still attached. It might already
1816 * be detached from its domain because the generic
1817 * iommu_detach_group code detached it and we try again here in
1818 * our alias handling.
1820 if (WARN_ON(!dev_data->domain))
1823 do_detach(dev_data);
1825 if (!dev_is_pci(dev))
1828 if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
1829 pdev_iommuv2_disable(to_pci_dev(dev));
1830 else if (dev_data->ats.enabled)
1831 pci_disable_ats(to_pci_dev(dev));
1833 dev_data->ats.enabled = false;
1836 spin_unlock(&dev_data->lock);
1838 spin_unlock_irqrestore(&domain->lock, flags);
1841 static struct iommu_device *amd_iommu_probe_device(struct device *dev)
1843 struct iommu_device *iommu_dev;
1844 struct amd_iommu *iommu;
1847 if (!check_device(dev))
1848 return ERR_PTR(-ENODEV);
1850 iommu = rlookup_amd_iommu(dev);
1852 return ERR_PTR(-ENODEV);
1854 if (dev_iommu_priv_get(dev))
1855 return &iommu->iommu;
1857 ret = iommu_init_device(iommu, dev);
1859 if (ret != -ENOTSUPP)
1860 dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
1861 iommu_dev = ERR_PTR(ret);
1862 iommu_ignore_device(iommu, dev);
1864 amd_iommu_set_pci_msi_domain(dev, iommu);
1865 iommu_dev = &iommu->iommu;
1868 iommu_completion_wait(iommu);
1873 static void amd_iommu_probe_finalize(struct device *dev)
1875 /* Domains are initialized for this device - have a look at what we ended up with */
1876 set_dma_ops(dev, NULL);
1877 iommu_setup_dma_ops(dev, 0, U64_MAX);
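/*
 * With the direct dma_ops cleared, iommu_setup_dma_ops() installs the
 * dma-iommu ops when the device ended up in a DMA domain; for identity
 * domains the device keeps using direct mapping.
 */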
1880 static void amd_iommu_release_device(struct device *dev)
1882 struct amd_iommu *iommu;
1884 if (!check_device(dev))
1887 iommu = rlookup_amd_iommu(dev);
1891 amd_iommu_uninit_device(dev);
1892 iommu_completion_wait(iommu);
1895 static struct iommu_group *amd_iommu_device_group(struct device *dev)
1897 if (dev_is_pci(dev))
1898 return pci_device_group(dev);
1900 return acpihid_device_group(dev);
1903 /*****************************************************************************
1905 * The next functions belong to the dma_ops mapping/unmapping code.
1907 *****************************************************************************/
1909 static void update_device_table(struct protection_domain *domain)
1911 struct iommu_dev_data *dev_data;
1913 list_for_each_entry(dev_data, &domain->dev_list, list) {
1914 struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
1918 set_dte_entry(iommu, dev_data->devid, domain,
1919 dev_data->ats.enabled, dev_data->iommu_v2);
1920 clone_aliases(iommu, dev_data->dev);
1924 void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
1926 update_device_table(domain);
1927 domain_flush_devices(domain);
1930 void amd_iommu_domain_update(struct protection_domain *domain)
1932 /* Update device table */
1933 amd_iommu_update_and_flush_device_table(domain);
1935 /* Flush domain TLB(s) and wait for completion */
1936 amd_iommu_domain_flush_tlb_pde(domain);
1937 amd_iommu_domain_flush_complete(domain);
1940 int __init amd_iommu_init_api(void)
1944 err = bus_set_iommu(&pci_bus_type, &amd_iommu_ops);
1947 #ifdef CONFIG_ARM_AMBA
1948 err = bus_set_iommu(&amba_bustype, &amd_iommu_ops);
1952 err = bus_set_iommu(&platform_bus_type, &amd_iommu_ops);
1959 /*****************************************************************************
1961 * The following functions belong to the exported interface of AMD IOMMU
1963 * This interface allows access to lower level functions of the IOMMU
1964 * like protection domain handling and assignment of devices to domains
1965 * which is not possible with the dma_ops interface.
1967 *****************************************************************************/
1969 static void cleanup_domain(struct protection_domain *domain)
1971 struct iommu_dev_data *entry;
1972 unsigned long flags;
1974 spin_lock_irqsave(&domain->lock, flags);
1976 while (!list_empty(&domain->dev_list)) {
1977 entry = list_first_entry(&domain->dev_list,
1978 struct iommu_dev_data, list);
1979 BUG_ON(!entry->domain);
1983 spin_unlock_irqrestore(&domain->lock, flags);
1986 static void protection_domain_free(struct protection_domain *domain)
1992 domain_id_free(domain->id);
1994 if (domain->iop.pgtbl_cfg.tlb)
1995 free_io_pgtable_ops(&domain->iop.iop.ops);
2000 static int protection_domain_init_v1(struct protection_domain *domain, int mode)
2002 u64 *pt_root = NULL;
2004 BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL);
2006 spin_lock_init(&domain->lock);
2007 domain->id = domain_id_alloc();
2010 INIT_LIST_HEAD(&domain->dev_list);
2012 if (mode != PAGE_MODE_NONE) {
2013 pt_root = (void *)get_zeroed_page(GFP_KERNEL);
2018 amd_iommu_domain_set_pgtable(domain, pt_root, mode);
2023 static struct protection_domain *protection_domain_alloc(unsigned int type)
2025 struct io_pgtable_ops *pgtbl_ops;
2026 struct protection_domain *domain;
2027 int pgtable = amd_iommu_pgtable;
2028 int mode = DEFAULT_PGTABLE_LEVEL;
2031 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2036 * Force IOMMU v1 page table when iommu=pt and
2037 * when allocating domain for pass-through devices.
2039 if (type == IOMMU_DOMAIN_IDENTITY) {
2040 pgtable = AMD_IOMMU_V1;
2041 mode = PAGE_MODE_NONE;
2042 } else if (type == IOMMU_DOMAIN_UNMANAGED) {
2043 pgtable = AMD_IOMMU_V1;
2048 ret = protection_domain_init_v1(domain, mode);
2057 pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain);
2067 static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
2069 struct protection_domain *domain;
2072 * Since DTE[Mode]=0 is prohibited on SNP-enabled systems,
2073 * default to use IOMMU_DOMAIN_DMA[_FQ].
2075 if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
2078 domain = protection_domain_alloc(type);
2082 domain->domain.geometry.aperture_start = 0;
2083 domain->domain.geometry.aperture_end = ~0ULL;
2084 domain->domain.geometry.force_aperture = true;
2086 return &domain->domain;
2089 static void amd_iommu_domain_free(struct iommu_domain *dom)
2091 struct protection_domain *domain;
2093 domain = to_pdomain(dom);
2095 if (domain->dev_cnt > 0)
2096 cleanup_domain(domain);
2098 BUG_ON(domain->dev_cnt != 0);
2103 if (domain->flags & PD_IOMMUV2_MASK)
2104 free_gcr3_table(domain);
2106 protection_domain_free(domain);
2109 static void amd_iommu_detach_device(struct iommu_domain *dom,
2112 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2113 struct amd_iommu *iommu;
2115 if (!check_device(dev))
2118 if (dev_data->domain != NULL)
2121 iommu = rlookup_amd_iommu(dev);
2125 #ifdef CONFIG_IRQ_REMAP
2126 if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
2127 (dom->type == IOMMU_DOMAIN_UNMANAGED))
2128 dev_data->use_vapic = 0;
2131 iommu_completion_wait(iommu);
2134 static int amd_iommu_attach_device(struct iommu_domain *dom,
2137 struct protection_domain *domain = to_pdomain(dom);
2138 struct iommu_dev_data *dev_data;
2139 struct amd_iommu *iommu;
2142 if (!check_device(dev))
2145 dev_data = dev_iommu_priv_get(dev);
2146 dev_data->defer_attach = false;
2148 iommu = rlookup_amd_iommu(dev);
2152 if (dev_data->domain)
2155 ret = attach_device(dev, domain);
2157 #ifdef CONFIG_IRQ_REMAP
2158 if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
2159 if (dom->type == IOMMU_DOMAIN_UNMANAGED)
2160 dev_data->use_vapic = 1;
2162 dev_data->use_vapic = 0;
2166 iommu_completion_wait(iommu);
2171 static void amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
2172 unsigned long iova, size_t size)
2174 struct protection_domain *domain = to_pdomain(dom);
2175 struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2178 domain_flush_np_cache(domain, iova, size);
2181 static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
2182 phys_addr_t paddr, size_t page_size, int iommu_prot,
2185 struct protection_domain *domain = to_pdomain(dom);
2186 struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2190 if ((amd_iommu_pgtable == AMD_IOMMU_V1) &&
2191 (domain->iop.mode == PAGE_MODE_NONE))
2194 if (iommu_prot & IOMMU_READ)
2195 prot |= IOMMU_PROT_IR;
2196 if (iommu_prot & IOMMU_WRITE)
2197 prot |= IOMMU_PROT_IW;
2200 ret = ops->map(ops, iova, paddr, page_size, prot, gfp);
2205 static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
2206 struct iommu_iotlb_gather *gather,
2207 unsigned long iova, size_t size)
2210 * AMD's IOMMU can flush as many pages as necessary in a single flush.
2211 * Unless we run in a virtual machine, which can be inferred according
2212 * to whether "non-present cache" is on, it is probably best to prefer
2213 * (potentially) too extensive TLB flushing (i.e., more misses) over
2214 * multiple TLB flushes (i.e., more flushes). For virtual machines the
2215 * hypervisor needs to synchronize the host IOMMU PTEs with those of
2216 * the guest, and the trade-off is different: unnecessary TLB flushes
2217 * should be avoided.
2219 if (amd_iommu_np_cache &&
2220 iommu_iotlb_gather_is_disjoint(gather, iova, size))
2221 iommu_iotlb_sync(domain, gather);
2223 iommu_iotlb_gather_add_range(gather, iova, size);
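/*
 * Example: gathering 0x1000-0x1fff and later 0x5000-0x5fff is disjoint.
 * On bare metal the gather range simply widens to 0x1000-0x5fff; on a
 * vIOMMU (np_cache set) the first range is synced before the new one is
 * recorded, trading an extra flush for not invalidating untouched PTEs.
 */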
2226 static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2228 struct iommu_iotlb_gather *gather)
2230 struct protection_domain *domain = to_pdomain(dom);
2231 struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2234 if ((amd_iommu_pgtable == AMD_IOMMU_V1) &&
2235 (domain->iop.mode == PAGE_MODE_NONE))
2238 r = (ops->unmap) ? ops->unmap(ops, iova, page_size, gather) : 0;
2240 amd_iommu_iotlb_gather_add_page(dom, gather, iova, page_size);
2245 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2248 struct protection_domain *domain = to_pdomain(dom);
2249 struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2251 return ops->iova_to_phys(ops, iova);
2254 static bool amd_iommu_capable(enum iommu_cap cap)
2257 case IOMMU_CAP_CACHE_COHERENCY:
2259 case IOMMU_CAP_INTR_REMAP:
2260 return (irq_remapping_enabled == 1);
2261 case IOMMU_CAP_NOEXEC:
2263 case IOMMU_CAP_PRE_BOOT_PROTECTION:
2264 return amdr_ivrs_remap_support;
2272 static void amd_iommu_get_resv_regions(struct device *dev,
2273 struct list_head *head)
2275 struct iommu_resv_region *region;
2276 struct unity_map_entry *entry;
2277 struct amd_iommu *iommu;
2278 struct amd_iommu_pci_seg *pci_seg;
2281 sbdf = get_device_sbdf_id(dev);
2285 devid = PCI_SBDF_TO_DEVID(sbdf);
2286 iommu = rlookup_amd_iommu(dev);
2289 pci_seg = iommu->pci_seg;
2291 list_for_each_entry(entry, &pci_seg->unity_map, list) {
2295 if (devid < entry->devid_start || devid > entry->devid_end)
2298 type = IOMMU_RESV_DIRECT;
2299 length = entry->address_end - entry->address_start;
2300 if (entry->prot & IOMMU_PROT_IR)
2302 if (entry->prot & IOMMU_PROT_IW)
2303 prot |= IOMMU_WRITE;
2304 if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
2305 /* Exclusion range */
2306 type = IOMMU_RESV_RESERVED;
2308 region = iommu_alloc_resv_region(entry->address_start,
2309 length, prot, type);
2311 dev_err(dev, "Out of memory allocating dm-regions\n");
2314 list_add_tail(&region->list, head);
2317 region = iommu_alloc_resv_region(MSI_RANGE_START,
2318 MSI_RANGE_END - MSI_RANGE_START + 1,
2322 list_add_tail(&region->list, head);
2324 region = iommu_alloc_resv_region(HT_RANGE_START,
2325 HT_RANGE_END - HT_RANGE_START + 1,
2326 0, IOMMU_RESV_RESERVED);
2329 list_add_tail(&region->list, head);
2332 bool amd_iommu_is_attach_deferred(struct device *dev)
2334 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2336 return dev_data->defer_attach;
2338 EXPORT_SYMBOL_GPL(amd_iommu_is_attach_deferred);
2340 static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
2342 struct protection_domain *dom = to_pdomain(domain);
2343 unsigned long flags;
2345 spin_lock_irqsave(&dom->lock, flags);
2346 amd_iommu_domain_flush_tlb_pde(dom);
2347 amd_iommu_domain_flush_complete(dom);
2348 spin_unlock_irqrestore(&dom->lock, flags);
2351 static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
2352 struct iommu_iotlb_gather *gather)
2354 struct protection_domain *dom = to_pdomain(domain);
2355 unsigned long flags;
2357 spin_lock_irqsave(&dom->lock, flags);
2358 domain_flush_pages(dom, gather->start, gather->end - gather->start, 1);
2359 amd_iommu_domain_flush_complete(dom);
2360 spin_unlock_irqrestore(&dom->lock, flags);
2363 static int amd_iommu_def_domain_type(struct device *dev)
2365 struct iommu_dev_data *dev_data;
2367 dev_data = dev_iommu_priv_get(dev);
2372 * Do not identity map IOMMUv2 capable devices when memory encryption is
2373 * active, because some of those devices (AMD GPUs) don't have the
2374 * encryption bit in their DMA-mask and require remapping.
2376 if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT) && dev_data->iommu_v2)
2377 return IOMMU_DOMAIN_IDENTITY;
2382 static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
2384 /* IOMMU_PTE_FC is always set */
2388 const struct iommu_ops amd_iommu_ops = {
2389 .capable = amd_iommu_capable,
2390 .domain_alloc = amd_iommu_domain_alloc,
2391 .probe_device = amd_iommu_probe_device,
2392 .release_device = amd_iommu_release_device,
2393 .probe_finalize = amd_iommu_probe_finalize,
2394 .device_group = amd_iommu_device_group,
2395 .get_resv_regions = amd_iommu_get_resv_regions,
2396 .is_attach_deferred = amd_iommu_is_attach_deferred,
2397 .pgsize_bitmap = AMD_IOMMU_PGSIZES,
2398 .def_domain_type = amd_iommu_def_domain_type,
2399 .default_domain_ops = &(const struct iommu_domain_ops) {
2400 .attach_dev = amd_iommu_attach_device,
2401 .detach_dev = amd_iommu_detach_device,
2402 .map = amd_iommu_map,
2403 .unmap = amd_iommu_unmap,
2404 .iotlb_sync_map = amd_iommu_iotlb_sync_map,
2405 .iova_to_phys = amd_iommu_iova_to_phys,
2406 .flush_iotlb_all = amd_iommu_flush_iotlb_all,
2407 .iotlb_sync = amd_iommu_iotlb_sync,
2408 .free = amd_iommu_domain_free,
2409 .enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
2413 /*****************************************************************************
2415 * The next functions do a basic initialization of the IOMMU for pass-through mode.
2418 * In passthrough mode the IOMMU is initialized and enabled but not used for
2419 * DMA-API translation.
2421 *****************************************************************************/
2423 /* IOMMUv2 specific functions */
2424 int amd_iommu_register_ppr_notifier(struct notifier_block *nb)
2426 return atomic_notifier_chain_register(&ppr_notifier, nb);
2428 EXPORT_SYMBOL(amd_iommu_register_ppr_notifier);
2430 int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb)
2432 return atomic_notifier_chain_unregister(&ppr_notifier, nb);
2434 EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
2436 void amd_iommu_domain_direct_map(struct iommu_domain *dom)
2438 struct protection_domain *domain = to_pdomain(dom);
2439 unsigned long flags;
2441 spin_lock_irqsave(&domain->lock, flags);
2443 if (domain->iop.pgtbl_cfg.tlb)
2444 free_io_pgtable_ops(&domain->iop.iop.ops);
2446 spin_unlock_irqrestore(&domain->lock, flags);
2448 EXPORT_SYMBOL(amd_iommu_domain_direct_map);
2450 int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids)
2451 {
2452 struct protection_domain *domain = to_pdomain(dom);
2453 unsigned long flags;
2454 int levels, ret;
2456 /* Number of GCR3 table levels required */
2457 for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9)
2458 levels += 1;
2460 if (levels > amd_iommu_max_glx_val)
2461 return -EINVAL;
2463 spin_lock_irqsave(&domain->lock, flags);
2465 /*
2466 * Save us all sanity checks whether devices already in the
2467 * domain support IOMMUv2. Just force that the domain has no
2468 * devices attached when it is switched into IOMMUv2 mode.
2469 */
2470 ret = -EBUSY;
2471 if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK)
2472 goto out;
2474 ret = -ENOMEM;
2475 domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC);
2476 if (domain->gcr3_tbl == NULL)
2477 goto out;
2479 domain->glx = levels;
2480 domain->flags |= PD_IOMMUV2_MASK;
2482 amd_iommu_domain_update(domain);
2484 ret = 0;
2486 out:
2487 spin_unlock_irqrestore(&domain->lock, flags);
2489 return ret;
2490 }
2491 EXPORT_SYMBOL(amd_iommu_domain_enable_v2);
2493 static int __flush_pasid(struct protection_domain *domain, u32 pasid,
2494 u64 address, bool size)
2495 {
2496 struct iommu_dev_data *dev_data;
2497 struct iommu_cmd cmd;
2498 int i, ret;
2500 if (!(domain->flags & PD_IOMMUV2_MASK))
2501 return -EINVAL;
2503 build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size);
2505 /*
2506 * IOMMU TLB needs to be flushed before Device TLB to
2507 * prevent device TLB refill from IOMMU TLB
2508 */
2509 for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
2510 if (domain->dev_iommu[i] == 0)
2511 continue;
2513 ret = iommu_queue_command(amd_iommus[i], &cmd);
2514 if (ret != 0)
2515 goto out;
2516 }
2518 /* Wait until IOMMU TLB flushes are complete */
2519 amd_iommu_domain_flush_complete(domain);
2521 /* Now flush device TLBs */
2522 list_for_each_entry(dev_data, &domain->dev_list, list) {
2523 struct amd_iommu *iommu;
2524 int qdep;
2526 /*
2527 * There might be non-IOMMUv2 capable devices in an IOMMUv2
2528 * domain.
2529 */
2530 if (!dev_data->ats.enabled)
2531 continue;
2533 qdep = dev_data->ats.qdep;
2534 iommu = rlookup_amd_iommu(dev_data->dev);
2535 if (!iommu)
2536 continue;
2537 build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid,
2538 qdep, address, size);
2540 ret = iommu_queue_command(iommu, &cmd);
2541 if (ret != 0)
2542 goto out;
2543 }
2545 /* Wait until all device TLBs are flushed */
2546 amd_iommu_domain_flush_complete(domain);
2548 ret = 0;
2550 out:
2551 return ret;
2552 }
2555 static int __amd_iommu_flush_page(struct protection_domain *domain, u32 pasid,
2558 return __flush_pasid(domain, pasid, address, false);
2561 int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid,
2564 struct protection_domain *domain = to_pdomain(dom);
2565 unsigned long flags;
2568 spin_lock_irqsave(&domain->lock, flags);
2569 ret = __amd_iommu_flush_page(domain, pasid, address);
2570 spin_unlock_irqrestore(&domain->lock, flags);
2574 EXPORT_SYMBOL(amd_iommu_flush_page);
2576 static int __amd_iommu_flush_tlb(struct protection_domain *domain, u32 pasid)
2578 return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
2582 int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid)
2584 struct protection_domain *domain = to_pdomain(dom);
2585 unsigned long flags;
2588 spin_lock_irqsave(&domain->lock, flags);
2589 ret = __amd_iommu_flush_tlb(domain, pasid);
2590 spin_unlock_irqrestore(&domain->lock, flags);
2594 EXPORT_SYMBOL(amd_iommu_flush_tlb);
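/*
 * Walk (and optionally allocate) the GCR3 table for a PASID. Every level is
 * indexed by 9 bits of the PASID; the returned pointer is the leaf entry
 * that holds the guest CR3 for this PASID.
 */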
2596 static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc)
2597 {
2598 int index;
2599 u64 *pte;
2601 while (true) {
2603 index = (pasid >> (9 * level)) & 0x1ff;
2604 pte = &root[index];
2606 if (level == 0)
2607 break;
2609 if (!(*pte & GCR3_VALID)) {
2610 if (!alloc)
2611 return NULL;
2613 root = (void *)get_zeroed_page(GFP_ATOMIC);
2614 if (root == NULL)
2615 return NULL;
2617 *pte = iommu_virt_to_phys(root) | GCR3_VALID;
2618 }
2620 root = iommu_phys_to_virt(*pte & PAGE_MASK);
2622 level -= 1;
2623 }
2625 return pte;
2626 }
2628 static int __set_gcr3(struct protection_domain *domain, u32 pasid,
2629 unsigned long cr3)
2630 {
2631 u64 *pte;
2633 if (domain->iop.mode != PAGE_MODE_NONE)
2634 return -EINVAL;
2636 pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
2637 if (pte == NULL)
2638 return -ENOMEM;
2640 *pte = (cr3 & PAGE_MASK) | GCR3_VALID;
2642 return __amd_iommu_flush_tlb(domain, pasid);
2643 }
2645 static int __clear_gcr3(struct protection_domain *domain, u32 pasid)
2646 {
2647 u64 *pte;
2649 if (domain->iop.mode != PAGE_MODE_NONE)
2650 return -EINVAL;
2652 pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
2653 if (pte == NULL)
2654 return 0;
2656 *pte = 0;
2658 return __amd_iommu_flush_tlb(domain, pasid);
2659 }
2661 int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid,
2664 struct protection_domain *domain = to_pdomain(dom);
2665 unsigned long flags;
2668 spin_lock_irqsave(&domain->lock, flags);
2669 ret = __set_gcr3(domain, pasid, cr3);
2670 spin_unlock_irqrestore(&domain->lock, flags);
2674 EXPORT_SYMBOL(amd_iommu_domain_set_gcr3);
2676 int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, u32 pasid)
2678 struct protection_domain *domain = to_pdomain(dom);
2679 unsigned long flags;
2682 spin_lock_irqsave(&domain->lock, flags);
2683 ret = __clear_gcr3(domain, pasid);
2684 spin_unlock_irqrestore(&domain->lock, flags);
2688 EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3);
2690 int amd_iommu_complete_ppr(struct pci_dev *pdev, u32 pasid,
2691 int status, int tag)
2692 {
2693 struct iommu_dev_data *dev_data;
2694 struct amd_iommu *iommu;
2695 struct iommu_cmd cmd;
2697 dev_data = dev_iommu_priv_get(&pdev->dev);
2698 iommu = rlookup_amd_iommu(&pdev->dev);
2699 if (!iommu)
2700 return -ENODEV;
2702 build_complete_ppr(&cmd, dev_data->devid, pasid, status,
2703 tag, dev_data->pri_tlp);
2705 return iommu_queue_command(iommu, &cmd);
2706 }
2707 EXPORT_SYMBOL(amd_iommu_complete_ppr);
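/*
 * Report the ATS/PRI/PASID capabilities of a device to IOMMUv2 users. A
 * minimal caller-side sketch (the pasid_limit variable is just an example):
 *
 *	struct amd_iommu_device_info info;
 *
 *	if (!amd_iommu_device_info(pdev, &info) &&
 *	    (info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP))
 *		pasid_limit = info.max_pasids;
 */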
2709 int amd_iommu_device_info(struct pci_dev *pdev,
2710 struct amd_iommu_device_info *info)
2711 {
2712 int max_pasids;
2713 int pos;
2715 if (pdev == NULL || info == NULL)
2716 return -EINVAL;
2718 if (!amd_iommu_v2_supported())
2719 return -EINVAL;
2721 memset(info, 0, sizeof(*info));
2723 if (pci_ats_supported(pdev))
2724 info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
2726 pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI);
2727 if (pos)
2728 info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
2730 pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID);
2731 if (pos) {
2732 int features;
2734 max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1));
2735 max_pasids = min(max_pasids, (1 << 20));
2737 info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
2738 info->max_pasids = min(pci_max_pasids(pdev), max_pasids);
2740 features = pci_pasid_features(pdev);
2741 if (features & PCI_PASID_CAP_EXEC)
2742 info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
2743 if (features & PCI_PASID_CAP_PRIV)
2744 info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
2745 }
2747 return 0;
2748 }
2751 #ifdef CONFIG_IRQ_REMAP
2753 /*****************************************************************************
2754  *
2755  * Interrupt Remapping Implementation
2756  *
2757  *****************************************************************************/
2759 static struct irq_chip amd_ir_chip;
2760 static DEFINE_SPINLOCK(iommu_table_lock);
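/*
 * Point a Device Table Entry at its interrupt remapping table: program the
 * table's physical address, the IntCtl remap mode and the table length into
 * DTE[2] and mark interrupt remapping as enabled for the device.
 */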
2762 static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
2763 struct irq_remap_table *table)
2766 struct dev_table_entry *dev_table = get_dev_table(iommu);
2768 dte = dev_table[devid].data[2];
2769 dte &= ~DTE_IRQ_PHYS_ADDR_MASK;
2770 dte |= iommu_virt_to_phys(table->table);
2771 dte |= DTE_IRQ_REMAP_INTCTL;
2772 dte |= DTE_INTTABLEN;
2773 dte |= DTE_IRQ_REMAP_ENABLE;
2775 dev_table[devid].data[2] = dte;
2778 static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
2780 struct irq_remap_table *table;
2781 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2783 if (WARN_ONCE(!pci_seg->rlookup_table[devid],
2784 "%s: no iommu for devid %x:%x\n",
2785 __func__, pci_seg->id, devid))
2786 return NULL;
2788 table = pci_seg->irq_lookup_table[devid];
2789 if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n",
2790 __func__, pci_seg->id, devid))
2791 return NULL;
2793 return table;
2794 }
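/*
 * Allocate an interrupt remapping table. The entries come from
 * amd_iommu_irq_cache and are zeroed for the IRTE format in use: 32-bit
 * entries in legacy mode, 128-bit entries when GA (guest virtual APIC)
 * mode is enabled.
 */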
2796 static struct irq_remap_table *__alloc_irq_table(void)
2797 {
2798 struct irq_remap_table *table;
2800 table = kzalloc(sizeof(*table), GFP_KERNEL);
2801 if (!table)
2802 return NULL;
2804 table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
2805 if (!table->table) {
2806 kfree(table);
2807 return NULL;
2808 }
2809 raw_spin_lock_init(&table->lock);
2811 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
2812 memset(table->table, 0,
2813 MAX_IRQS_PER_TABLE * sizeof(u32));
2814 else
2815 memset(table->table, 0,
2816 (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
2817 return table;
2818 }
2820 static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
2821 struct irq_remap_table *table)
2823 struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2825 pci_seg->irq_lookup_table[devid] = table;
2826 set_dte_irq_entry(iommu, devid, table);
2827 iommu_flush_dte(iommu, devid);
2830 static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
2833 struct irq_remap_table *table = data;
2834 struct amd_iommu_pci_seg *pci_seg;
2835 struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev);
2840 pci_seg = iommu->pci_seg;
2841 pci_seg->irq_lookup_table[alias] = table;
2842 set_dte_irq_entry(iommu, alias, table);
2843 iommu_flush_dte(pci_seg->rlookup_table[alias], alias);
2848 static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
2849 u16 devid, struct pci_dev *pdev)
2851 struct irq_remap_table *table = NULL;
2852 struct irq_remap_table *new_table = NULL;
2853 struct amd_iommu_pci_seg *pci_seg;
2854 unsigned long flags;
2857 spin_lock_irqsave(&iommu_table_lock, flags);
2859 pci_seg = iommu->pci_seg;
2860 table = pci_seg->irq_lookup_table[devid];
2864 alias = pci_seg->alias_table[devid];
2865 table = pci_seg->irq_lookup_table[alias];
2867 set_remap_table_entry(iommu, devid, table);
2870 spin_unlock_irqrestore(&iommu_table_lock, flags);
2872 /* Nothing there yet, allocate new irq remapping table */
2873 new_table = __alloc_irq_table();
2877 spin_lock_irqsave(&iommu_table_lock, flags);
2879 table = pci_seg->irq_lookup_table[devid];
2883 table = pci_seg->irq_lookup_table[alias];
2885 set_remap_table_entry(iommu, devid, table);
2893 pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
2896 set_remap_table_entry(iommu, devid, table);
2899 set_remap_table_entry(iommu, alias, table);
2902 iommu_completion_wait(iommu);
2905 spin_unlock_irqrestore(&iommu_table_lock, flags);
2907 if (new_table) {
2908 kmem_cache_free(amd_iommu_irq_cache, new_table->table);
2909 kfree(new_table);
2910 }
2911 return table;
2912 }
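/*
 * Reserve 'count' consecutive IRTEs for a device. Multi-MSI needs the block
 * to start at a power-of-two aligned index, hence the roundup_pow_of_two()
 * alignment when 'align' is set; on success the first index is returned.
 */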
2914 static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
2915 bool align, struct pci_dev *pdev)
2917 struct irq_remap_table *table;
2918 int index, c, alignment = 1;
2919 unsigned long flags;
2921 table = alloc_irq_table(iommu, devid, pdev);
2926 alignment = roundup_pow_of_two(count);
2928 raw_spin_lock_irqsave(&table->lock, flags);
2930 /* Scan table for free entries */
2931 for (index = ALIGN(table->min_index, alignment), c = 0;
2932 index < MAX_IRQS_PER_TABLE;) {
2933 if (!iommu->irte_ops->is_allocated(table, index)) {
2937 index = ALIGN(index + 1, alignment);
2943 iommu->irte_ops->set_allocated(table, index - c + 1);
2955 raw_spin_unlock_irqrestore(&table->lock, flags);
2957 return index;
2958 }
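/*
 * Replace a 128-bit (GA capable) IRTE in place, then make the change visible
 * to the hardware by flushing the interrupt remapping table cache and waiting
 * for command completion.
 */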
2960 static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
2961 struct irte_ga *irte, struct amd_ir_data *data)
2962 {
2963 bool ret;
2964 struct irq_remap_table *table;
2965 unsigned long flags;
2966 struct irte_ga *entry;
2968 table = get_irq_table(iommu, devid);
2969 if (!table)
2970 return -ENOMEM;
2972 raw_spin_lock_irqsave(&table->lock, flags);
2974 entry = (struct irte_ga *)table->table;
2975 entry = &entry[index];
2977 ret = cmpxchg_double(&entry->lo.val, &entry->hi.val,
2978 entry->lo.val, entry->hi.val,
2979 irte->lo.val, irte->hi.val);
2980 /*
2981 * We use cmpxchg16 to atomically update the 128-bit IRTE,
2982 * and it cannot be updated by the hardware or other processors
2983 * behind us, so the return value of cmpxchg16 should be the
2984 * same as the old value.
2985 */
2986 WARN_ON(!ret);
2988 if (data)
2989 data->ref = entry;
2991 raw_spin_unlock_irqrestore(&table->lock, flags);
2993 iommu_flush_irt(iommu, devid);
2994 iommu_completion_wait(iommu);
2996 return 0;
2997 }
2999 static int modify_irte(struct amd_iommu *iommu,
3000 u16 devid, int index, union irte *irte)
3002 struct irq_remap_table *table;
3003 unsigned long flags;
3005 table = get_irq_table(iommu, devid);
3009 raw_spin_lock_irqsave(&table->lock, flags);
3010 table->table[index] = irte->val;
3011 raw_spin_unlock_irqrestore(&table->lock, flags);
3013 iommu_flush_irt(iommu, devid);
3014 iommu_completion_wait(iommu);
3019 static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
3021 struct irq_remap_table *table;
3022 unsigned long flags;
3024 table = get_irq_table(iommu, devid);
3028 raw_spin_lock_irqsave(&table->lock, flags);
3029 iommu->irte_ops->clear_allocated(table, index);
3030 raw_spin_unlock_irqrestore(&table->lock, flags);
3032 iommu_flush_irt(iommu, devid);
3033 iommu_completion_wait(iommu);
3034 }
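/*
 * Format-specific IRTE helpers: the irte_* variants operate on legacy 32-bit
 * remapping entries, the irte_ga_* variants on the 128-bit entries used when
 * guest virtual APIC (GA) mode is enabled. They are dispatched through
 * iommu->irte_ops.
 */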
3036 static void irte_prepare(void *entry,
3037 u32 delivery_mode, bool dest_mode,
3038 u8 vector, u32 dest_apicid, int devid)
3040 union irte *irte = (union irte *) entry;
3043 irte->fields.vector = vector;
3044 irte->fields.int_type = delivery_mode;
3045 irte->fields.destination = dest_apicid;
3046 irte->fields.dm = dest_mode;
3047 irte->fields.valid = 1;
3050 static void irte_ga_prepare(void *entry,
3051 u32 delivery_mode, bool dest_mode,
3052 u8 vector, u32 dest_apicid, int devid)
3054 struct irte_ga *irte = (struct irte_ga *) entry;
3058 irte->lo.fields_remap.int_type = delivery_mode;
3059 irte->lo.fields_remap.dm = dest_mode;
3060 irte->hi.fields.vector = vector;
3061 irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
3062 irte->hi.fields.destination = APICID_TO_IRTE_DEST_HI(dest_apicid);
3063 irte->lo.fields_remap.valid = 1;
3066 static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3068 union irte *irte = (union irte *) entry;
3070 irte->fields.valid = 1;
3071 modify_irte(iommu, devid, index, irte);
3074 static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3076 struct irte_ga *irte = (struct irte_ga *) entry;
3078 irte->lo.fields_remap.valid = 1;
3079 modify_irte_ga(iommu, devid, index, irte, NULL);
3082 static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3084 union irte *irte = (union irte *) entry;
3086 irte->fields.valid = 0;
3087 modify_irte(iommu, devid, index, irte);
3090 static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3092 struct irte_ga *irte = (struct irte_ga *) entry;
3094 irte->lo.fields_remap.valid = 0;
3095 modify_irte_ga(iommu, devid, index, irte, NULL);
3098 static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3099 u8 vector, u32 dest_apicid)
3101 union irte *irte = (union irte *) entry;
3103 irte->fields.vector = vector;
3104 irte->fields.destination = dest_apicid;
3105 modify_irte(iommu, devid, index, irte);
3108 static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3109 u8 vector, u32 dest_apicid)
3111 struct irte_ga *irte = (struct irte_ga *) entry;
3113 if (!irte->lo.fields_remap.guest_mode) {
3114 irte->hi.fields.vector = vector;
3115 irte->lo.fields_remap.destination =
3116 APICID_TO_IRTE_DEST_LO(dest_apicid);
3117 irte->hi.fields.destination =
3118 APICID_TO_IRTE_DEST_HI(dest_apicid);
3119 modify_irte_ga(iommu, devid, index, irte, NULL);
3123 #define IRTE_ALLOCATED (~1U)
3124 static void irte_set_allocated(struct irq_remap_table *table, int index)
3126 table->table[index] = IRTE_ALLOCATED;
3129 static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
3131 struct irte_ga *ptr = (struct irte_ga *)table->table;
3132 struct irte_ga *irte = &ptr[index];
3134 memset(&irte->lo.val, 0, sizeof(u64));
3135 memset(&irte->hi.val, 0, sizeof(u64));
3136 irte->hi.fields.vector = 0xff;
3139 static bool irte_is_allocated(struct irq_remap_table *table, int index)
3141 union irte *ptr = (union irte *)table->table;
3142 union irte *irte = &ptr[index];
3144 return irte->val != 0;
3147 static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
3149 struct irte_ga *ptr = (struct irte_ga *)table->table;
3150 struct irte_ga *irte = &ptr[index];
3152 return irte->hi.fields.vector != 0;
3155 static void irte_clear_allocated(struct irq_remap_table *table, int index)
3157 table->table[index] = 0;
3160 static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
3162 struct irte_ga *ptr = (struct irte_ga *)table->table;
3163 struct irte_ga *irte = &ptr[index];
3165 memset(&irte->lo.val, 0, sizeof(u64));
3166 memset(&irte->hi.val, 0, sizeof(u64));
3169 static int get_devid(struct irq_alloc_info *info)
3171 switch (info->type) {
3172 case X86_IRQ_ALLOC_TYPE_IOAPIC:
3173 return get_ioapic_devid(info->devid);
3174 case X86_IRQ_ALLOC_TYPE_HPET:
3175 return get_hpet_devid(info->devid);
3176 case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3177 case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3178 return get_device_sbdf_id(msi_desc_to_dev(info->desc));
3179 default:
3180 WARN_ON_ONCE(1);
3181 return -1;
3182 }
3183 }
3185 struct irq_remap_ops amd_iommu_irq_ops = {
3186 .prepare = amd_iommu_prepare,
3187 .enable = amd_iommu_enable,
3188 .disable = amd_iommu_disable,
3189 .reenable = amd_iommu_reenable,
3190 .enable_faulting = amd_iommu_enable_faulting,
3191 };
3193 static void fill_msi_msg(struct msi_msg *msg, u32 index)
3194 {
3195 msg->data = index;
3196 msg->address_lo = 0;
3197 msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
3198 msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
3199 }
3201 static void irq_remapping_prepare_irte(struct amd_ir_data *data,
3202 struct irq_cfg *irq_cfg,
3203 struct irq_alloc_info *info,
3204 int devid, int index, int sub_handle)
3206 struct irq_2_irte *irte_info = &data->irq_2_irte;
3207 struct amd_iommu *iommu = data->iommu;
3212 data->irq_2_irte.devid = devid;
3213 data->irq_2_irte.index = index + sub_handle;
3214 iommu->irte_ops->prepare(data->entry, apic->delivery_mode,
3215 apic->dest_mode_logical, irq_cfg->vector,
3216 irq_cfg->dest_apicid, devid);
3218 switch (info->type) {
3219 case X86_IRQ_ALLOC_TYPE_IOAPIC:
3220 case X86_IRQ_ALLOC_TYPE_HPET:
3221 case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3222 case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3223 fill_msi_msg(&data->msi_entry, irte_info->index);
3232 struct amd_irte_ops irte_32_ops = {
3233 .prepare = irte_prepare,
3234 .activate = irte_activate,
3235 .deactivate = irte_deactivate,
3236 .set_affinity = irte_set_affinity,
3237 .set_allocated = irte_set_allocated,
3238 .is_allocated = irte_is_allocated,
3239 .clear_allocated = irte_clear_allocated,
3240 };
3242 struct amd_irte_ops irte_128_ops = {
3243 .prepare = irte_ga_prepare,
3244 .activate = irte_ga_activate,
3245 .deactivate = irte_ga_deactivate,
3246 .set_affinity = irte_ga_set_affinity,
3247 .set_allocated = irte_ga_set_allocated,
3248 .is_allocated = irte_ga_is_allocated,
3249 .clear_allocated = irte_ga_clear_allocated,
3250 };
3252 static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
3253 unsigned int nr_irqs, void *arg)
3255 struct irq_alloc_info *info = arg;
3256 struct irq_data *irq_data;
3257 struct amd_ir_data *data = NULL;
3258 struct amd_iommu *iommu;
3259 struct irq_cfg *cfg;
3260 int i, ret, devid, seg, sbdf;
3265 if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI &&
3266 info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX)
3269 /*
3270 * With IRQ remapping enabled, don't need contiguous CPU vectors
3271 * to support multiple MSI interrupts.
3272 */
3273 if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI)
3274 info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
3276 sbdf = get_devid(info);
3280 seg = PCI_SBDF_TO_SEGID(sbdf);
3281 devid = PCI_SBDF_TO_DEVID(sbdf);
3282 iommu = __rlookup_amd_iommu(seg, devid);
3286 ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
3290 if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
3291 struct irq_remap_table *table;
3293 table = alloc_irq_table(iommu, devid, NULL);
3295 if (!table->min_index) {
3296 /*
3297 * Keep the first 32 indexes free for IOAPIC
3298 * interrupts.
3299 */
3300 table->min_index = 32;
3301 for (i = 0; i < 32; ++i)
3302 iommu->irte_ops->set_allocated(table, i);
3304 WARN_ON(table->min_index != 32);
3305 index = info->ioapic.pin;
3309 } else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
3310 info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
3311 bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
3313 index = alloc_irq_index(iommu, devid, nr_irqs, align,
3314 msi_desc_to_pci_dev(info->desc));
3316 index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL);
3320 pr_warn("Failed to allocate IRTE\n");
3322 goto out_free_parent;
3325 for (i = 0; i < nr_irqs; i++) {
3326 irq_data = irq_domain_get_irq_data(domain, virq + i);
3327 cfg = irq_data ? irqd_cfg(irq_data) : NULL;
3334 data = kzalloc(sizeof(*data), GFP_KERNEL);
3338 if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
3339 data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
3341 data->entry = kzalloc(sizeof(struct irte_ga),
3348 data->iommu = iommu;
3349 irq_data->hwirq = (devid << 16) + i;
3350 irq_data->chip_data = data;
3351 irq_data->chip = &amd_ir_chip;
3352 irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
3353 irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
3359 for (i--; i >= 0; i--) {
3360 irq_data = irq_domain_get_irq_data(domain, virq + i);
3362 kfree(irq_data->chip_data);
3364 for (i = 0; i < nr_irqs; i++)
3365 free_irte(iommu, devid, index + i);
3367 irq_domain_free_irqs_common(domain, virq, nr_irqs);
3371 static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
3372 unsigned int nr_irqs)
3374 struct irq_2_irte *irte_info;
3375 struct irq_data *irq_data;
3376 struct amd_ir_data *data;
3379 for (i = 0; i < nr_irqs; i++) {
3380 irq_data = irq_domain_get_irq_data(domain, virq + i);
3381 if (irq_data && irq_data->chip_data) {
3382 data = irq_data->chip_data;
3383 irte_info = &data->irq_2_irte;
3384 free_irte(data->iommu, irte_info->devid, irte_info->index);
3389 irq_domain_free_irqs_common(domain, virq, nr_irqs);
3392 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3393 struct amd_ir_data *ir_data,
3394 struct irq_2_irte *irte_info,
3395 struct irq_cfg *cfg);
3397 static int irq_remapping_activate(struct irq_domain *domain,
3398 struct irq_data *irq_data, bool reserve)
3400 struct amd_ir_data *data = irq_data->chip_data;
3401 struct irq_2_irte *irte_info = &data->irq_2_irte;
3402 struct amd_iommu *iommu = data->iommu;
3403 struct irq_cfg *cfg = irqd_cfg(irq_data);
3408 iommu->irte_ops->activate(iommu, data->entry, irte_info->devid,
3410 amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
3414 static void irq_remapping_deactivate(struct irq_domain *domain,
3415 struct irq_data *irq_data)
3417 struct amd_ir_data *data = irq_data->chip_data;
3418 struct irq_2_irte *irte_info = &data->irq_2_irte;
3419 struct amd_iommu *iommu = data->iommu;
3422 iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid,
3426 static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec,
3427 enum irq_domain_bus_token bus_token)
3429 struct amd_iommu *iommu;
3430 int devid = -1;
3432 if (!amd_iommu_irq_remap)
3433 return 0;
3435 if (x86_fwspec_is_ioapic(fwspec))
3436 devid = get_ioapic_devid(fwspec->param[0]);
3437 else if (x86_fwspec_is_hpet(fwspec))
3438 devid = get_hpet_devid(fwspec->param[0]);
3440 if (devid < 0)
3441 return 0;
3442 iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff));
3444 return iommu && iommu->ir_domain == d;
3445 }
3447 static const struct irq_domain_ops amd_ir_domain_ops = {
3448 .select = irq_remapping_select,
3449 .alloc = irq_remapping_alloc,
3450 .free = irq_remapping_free,
3451 .activate = irq_remapping_activate,
3452 .deactivate = irq_remapping_deactivate,
3453 };
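/*
 * Switch an IRTE into guest (virtual APIC) mode so that the interrupt is
 * posted to the vCPU described by ga_root_ptr/ga_vector/ga_tag instead of
 * being remapped to a host destination.
 */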
3455 int amd_iommu_activate_guest_mode(void *data)
3456 {
3457 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3458 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3459 u64 valid;
3461 if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3462 !entry || entry->lo.fields_vapic.guest_mode)
3463 return 0;
3465 valid = entry->lo.fields_vapic.valid;
3467 entry->lo.val = 0;
3468 entry->hi.val = 0;
3470 entry->lo.fields_vapic.valid = valid;
3471 entry->lo.fields_vapic.guest_mode = 1;
3472 entry->lo.fields_vapic.ga_log_intr = 1;
3473 entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr;
3474 entry->hi.fields.vector = ir_data->ga_vector;
3475 entry->lo.fields_vapic.ga_tag = ir_data->ga_tag;
3477 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3478 ir_data->irq_2_irte.index, entry, ir_data);
3479 }
3480 EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
3482 int amd_iommu_deactivate_guest_mode(void *data)
3483 {
3484 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3485 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3486 struct irq_cfg *cfg = ir_data->cfg;
3487 u64 valid;
3489 if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3490 !entry || !entry->lo.fields_vapic.guest_mode)
3491 return 0;
3493 valid = entry->lo.fields_remap.valid;
3495 entry->lo.val = 0;
3496 entry->hi.val = 0;
3498 entry->lo.fields_remap.valid = valid;
3499 entry->lo.fields_remap.dm = apic->dest_mode_logical;
3500 entry->lo.fields_remap.int_type = apic->delivery_mode;
3501 entry->hi.fields.vector = cfg->vector;
3502 entry->lo.fields_remap.destination =
3503 APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
3504 entry->hi.fields.destination =
3505 APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
3507 return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3508 ir_data->irq_2_irte.index, entry, ir_data);
3509 }
3510 EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
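/*
 * irq_set_vcpu_affinity() callback, used by KVM/SVM to request posted
 * interrupt delivery for a vCPU. Depending on pi_data->is_guest_mode this
 * either programs guest mode into the IRTE or falls back to plain host
 * remapping.
 */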
3512 static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
3515 struct amd_iommu_pi_data *pi_data = vcpu_info;
3516 struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
3517 struct amd_ir_data *ir_data = data->chip_data;
3518 struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3519 struct iommu_dev_data *dev_data;
3521 if (ir_data->iommu == NULL)
3524 dev_data = search_dev_data(ir_data->iommu, irte_info->devid);
3526 /* Note:
3527 * This device has never been set up for guest mode.
3528 * we should not modify the IRTE
3529 */
3530 if (!dev_data || !dev_data->use_vapic)
3531 return 0;
3533 ir_data->cfg = irqd_cfg(data);
3534 pi_data->ir_data = ir_data;
3536 /* Note:
3537 * SVM tries to set up for VAPIC mode, but we are in
3538 * legacy mode. So, we force legacy mode instead.
3539 */
3540 if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
3541 pr_debug("%s: Fall back to using intr legacy remap\n",
3542 __func__);
3543 pi_data->is_guest_mode = false;
3544 }
3546 pi_data->prev_ga_tag = ir_data->cached_ga_tag;
3547 if (pi_data->is_guest_mode) {
3548 ir_data->ga_root_ptr = (pi_data->base >> 12);
3549 ir_data->ga_vector = vcpu_pi_info->vector;
3550 ir_data->ga_tag = pi_data->ga_tag;
3551 ret = amd_iommu_activate_guest_mode(ir_data);
3553 ir_data->cached_ga_tag = pi_data->ga_tag;
3555 ret = amd_iommu_deactivate_guest_mode(ir_data);
3557 /*
3558 * This communicates the ga_tag back to the caller
3559 * so that it can do all the necessary clean up.
3560 */
3561 if (!ret)
3562 ir_data->cached_ga_tag = 0;
3569 static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3570 struct amd_ir_data *ir_data,
3571 struct irq_2_irte *irte_info,
3572 struct irq_cfg *cfg)
3575 /*
3576 * Atomically updates the IRTE with the new destination, vector
3577 * and flushes the interrupt entry cache.
3578 */
3579 iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid,
3580 irte_info->index, cfg->vector,
3584 static int amd_ir_set_affinity(struct irq_data *data,
3585 const struct cpumask *mask, bool force)
3587 struct amd_ir_data *ir_data = data->chip_data;
3588 struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3589 struct irq_cfg *cfg = irqd_cfg(data);
3590 struct irq_data *parent = data->parent_data;
3591 struct amd_iommu *iommu = ir_data->iommu;
3597 ret = parent->chip->irq_set_affinity(parent, mask, force);
3598 if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
3601 amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
3602 /*
3603 * After this point, all the interrupts will start arriving
3604 * at the new destination. So, time to cleanup the previous
3605 * vector allocation.
3606 */
3607 send_cleanup_vector(cfg);
3609 return IRQ_SET_MASK_OK_DONE;
3612 static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
3614 struct amd_ir_data *ir_data = irq_data->chip_data;
3616 *msg = ir_data->msi_entry;
3619 static struct irq_chip amd_ir_chip = {
3620 .name = "AMD-IR",
3621 .irq_ack = apic_ack_irq,
3622 .irq_set_affinity = amd_ir_set_affinity,
3623 .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity,
3624 .irq_compose_msi_msg = ir_compose_msi_msg,
3625 };
3627 int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
3628 {
3629 struct fwnode_handle *fn;
3631 fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
3632 if (!fn)
3633 return -ENOMEM;
3634 iommu->ir_domain = irq_domain_create_tree(fn, &amd_ir_domain_ops, iommu);
3635 if (!iommu->ir_domain) {
3636 irq_domain_free_fwnode(fn);
3637 return -ENOMEM;
3638 }
3640 iommu->ir_domain->parent = arch_get_ir_parent_domain();
3641 iommu->msi_domain = arch_create_remap_msi_irq_domain(iommu->ir_domain,
3642 "AMD-IR-MSI",
3643 iommu->index);
3645 return 0;
3646 }
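/*
 * Keep a guest-mode IRTE's destination and is_run hint in sync with the
 * physical CPU a vCPU runs on, typically invoked by the KVM AVIC code when
 * a vCPU is loaded or unloaded.
 */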
3647 int amd_iommu_update_ga(int cpu, bool is_run, void *data)
3648 {
3649 unsigned long flags;
3650 struct amd_iommu *iommu;
3651 struct irq_remap_table *table;
3652 struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3653 int devid = ir_data->irq_2_irte.devid;
3654 struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3655 struct irte_ga *ref = (struct irte_ga *) ir_data->ref;
3657 if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3658 !ref || !entry || !entry->lo.fields_vapic.guest_mode)
3659 return 0;
3661 iommu = ir_data->iommu;
3662 if (!iommu)
3663 return -ENODEV;
3665 table = get_irq_table(iommu, devid);
3666 if (!table)
3667 return -ENODEV;
3669 raw_spin_lock_irqsave(&table->lock, flags);
3671 if (ref->lo.fields_vapic.guest_mode) {
3672 if (cpu >= 0) {
3673 ref->lo.fields_vapic.destination =
3674 APICID_TO_IRTE_DEST_LO(cpu);
3675 ref->hi.fields.destination =
3676 APICID_TO_IRTE_DEST_HI(cpu);
3677 }
3678 ref->lo.fields_vapic.is_run = is_run;
3679 barrier();
3680 }
3682 raw_spin_unlock_irqrestore(&table->lock, flags);
3684 iommu_flush_irt(iommu, devid);
3685 iommu_completion_wait(iommu);
3686 return 0;
3687 }
3688 EXPORT_SYMBOL(amd_iommu_update_ga);