1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
31 #include "cap_audit.h"
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
54 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
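/*
 * For illustration: with gaw == 48 (4-level paging), __DOMAIN_MAX_PFN(48)
 * is (1ULL << 36) - 1, i.e. the last 4KiB page frame of a 48-bit IOVA
 * space, and DOMAIN_MAX_ADDR(48) is that PFN shifted back into an address.
 */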
57 /* IO virtual address start page frame number */
58 #define IOVA_START_PFN (1)
60 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
62 static void __init check_tylersburg_isoch(void);
63 static int rwbf_quirk;
 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
 * (used when the kernel is launched w/ TXT)
69 static int force_on = 0;
70 static int intel_iommu_tboot_noforce;
71 static int no_platform_optin;
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
76 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
79 static phys_addr_t root_entry_lctp(struct root_entry *re)
84 return re->lo & VTD_PAGE_MASK;
88 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
91 static phys_addr_t root_entry_uctp(struct root_entry *re)
96 return re->hi & VTD_PAGE_MASK;
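/*
 * Device rbtree helpers: each IOMMU tracks the devices it has probed in
 * iommu->device_rbtree, keyed by PCI requester ID (bus:devfn), so that a
 * device can be looked up from a source ID (e.g. when handling faults).
 */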
99 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
101 struct device_domain_info *info =
102 rb_entry(node, struct device_domain_info, node);
103 const u16 *rid_lhs = key;
105 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
108 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
114 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
116 struct device_domain_info *info =
117 rb_entry(lhs, struct device_domain_info, node);
118 u16 key = PCI_DEVID(info->bus, info->devfn);
120 return device_rid_cmp_key(&key, rhs);
 * Looks up an IOMMU-probed device using its source ID.
 *
 * Returns the pointer to the device if there is a match. Otherwise,
 * returns NULL.
 *
 * Note that this helper doesn't guarantee that the device won't be
 * released by the iommu subsystem after being returned. The caller
 * should use its own synchronization mechanism to avoid the device
 * being released during its use if that is a possibility.
134 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
136 struct device_domain_info *info = NULL;
137 struct rb_node *node;
140 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
141 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
143 info = rb_entry(node, struct device_domain_info, node);
144 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
146 return info ? info->dev : NULL;
149 static int device_rbtree_insert(struct intel_iommu *iommu,
150 struct device_domain_info *info)
152 struct rb_node *curr;
155 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
156 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
157 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
164 static void device_rbtree_remove(struct device_domain_info *info)
166 struct intel_iommu *iommu = info->iommu;
169 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
170 rb_erase(&info->node, &iommu->device_rbtree);
171 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
180 static struct dmar_domain *si_domain;
181 static int hw_pass_through = 1;
183 struct dmar_rmrr_unit {
184 struct list_head list; /* list of rmrr units */
185 struct acpi_dmar_header *hdr; /* ACPI header */
186 u64 base_address; /* reserved base address*/
187 u64 end_address; /* reserved end address */
188 struct dmar_dev_scope *devices; /* target devices */
189 int devices_cnt; /* target device count */
192 struct dmar_atsr_unit {
193 struct list_head list; /* list of ATSR units */
194 struct acpi_dmar_header *hdr; /* ACPI header */
195 struct dmar_dev_scope *devices; /* target devices */
196 int devices_cnt; /* target device count */
197 u8 include_all:1; /* include all ports */
200 struct dmar_satc_unit {
201 struct list_head list; /* list of SATC units */
202 struct acpi_dmar_header *hdr; /* ACPI header */
203 struct dmar_dev_scope *devices; /* target devices */
204 struct intel_iommu *iommu; /* the corresponding iommu */
205 int devices_cnt; /* target device count */
206 u8 atc_required:1; /* ATS is required */
209 static LIST_HEAD(dmar_atsr_units);
210 static LIST_HEAD(dmar_rmrr_units);
211 static LIST_HEAD(dmar_satc_units);
213 #define for_each_rmrr_units(rmrr) \
214 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
216 static void intel_iommu_domain_free(struct iommu_domain *domain);
218 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
219 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
221 int intel_iommu_enabled = 0;
222 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
224 static int dmar_map_gfx = 1;
225 static int intel_iommu_superpage = 1;
226 static int iommu_identity_mapping;
227 static int iommu_skip_te_disable;
229 #define IDENTMAP_GFX 2
230 #define IDENTMAP_AZALIA 4
232 const struct iommu_ops intel_iommu_ops;
233 static const struct iommu_dirty_ops intel_dirty_ops;
235 static bool translation_pre_enabled(struct intel_iommu *iommu)
237 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
240 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
242 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
245 static void init_translation_status(struct intel_iommu *iommu)
249 gsts = readl(iommu->reg + DMAR_GSTS_REG);
250 if (gsts & DMA_GSTS_TES)
251 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
254 static int __init intel_iommu_setup(char *str)
260 if (!strncmp(str, "on", 2)) {
262 pr_info("IOMMU enabled\n");
263 } else if (!strncmp(str, "off", 3)) {
265 no_platform_optin = 1;
266 pr_info("IOMMU disabled\n");
267 } else if (!strncmp(str, "igfx_off", 8)) {
269 pr_info("Disable GFX device mapping\n");
270 } else if (!strncmp(str, "forcedac", 8)) {
271 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
272 iommu_dma_forcedac = true;
273 } else if (!strncmp(str, "strict", 6)) {
274 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
275 iommu_set_dma_strict();
276 } else if (!strncmp(str, "sp_off", 6)) {
277 pr_info("Disable supported super page\n");
278 intel_iommu_superpage = 0;
279 } else if (!strncmp(str, "sm_on", 5)) {
280 pr_info("Enable scalable mode if hardware supports\n");
282 } else if (!strncmp(str, "sm_off", 6)) {
283 pr_info("Scalable mode is disallowed\n");
285 } else if (!strncmp(str, "tboot_noforce", 13)) {
286 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
287 intel_iommu_tboot_noforce = 1;
289 pr_notice("Unknown option - '%s'\n", str);
292 str += strcspn(str, ",");
299 __setup("intel_iommu=", intel_iommu_setup);
301 void *alloc_pgtable_page(int node, gfp_t gfp)
306 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
308 vaddr = page_address(page);
312 void free_pgtable_page(void *vaddr)
314 free_page((unsigned long)vaddr);
317 static int domain_type_is_si(struct dmar_domain *domain)
319 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
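/* Return true if @pfn fits within the domain's address width. */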
322 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
324 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
326 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
330 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
331 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
332 * the returned SAGAW.
334 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
336 unsigned long fl_sagaw, sl_sagaw;
338 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
339 sl_sagaw = cap_sagaw(iommu->cap);
341 /* Second level only. */
342 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
345 /* First level only. */
346 if (!ecap_slts(iommu->ecap))
349 return fl_sagaw & sl_sagaw;
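/*
 * In the SAGAW encoding used above, BIT(2) corresponds to 4-level (48-bit)
 * and BIT(3) to 5-level (57-bit) paging; hence first-level support is
 * expressed as BIT(2), plus BIT(3) when 5-level first-level paging is
 * supported.
 */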
352 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
357 sagaw = __iommu_calculate_sagaw(iommu);
358 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
359 if (test_bit(agaw, &sagaw))
367 * Calculate max SAGAW for each iommu.
369 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
371 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 * Calculate the agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
379 int iommu_calculate_agaw(struct intel_iommu *iommu)
381 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
384 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
386 return sm_supported(iommu) ?
387 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
390 static void domain_update_iommu_coherency(struct dmar_domain *domain)
392 struct iommu_domain_info *info;
393 struct dmar_drhd_unit *drhd;
394 struct intel_iommu *iommu;
398 domain->iommu_coherency = true;
399 xa_for_each(&domain->iommu_array, i, info) {
401 if (!iommu_paging_structure_coherency(info->iommu)) {
402 domain->iommu_coherency = false;
409 /* No hardware attached; use lowest common denominator */
411 for_each_active_iommu(iommu, drhd) {
412 if (!iommu_paging_structure_coherency(iommu)) {
413 domain->iommu_coherency = false;
420 static int domain_update_iommu_superpage(struct dmar_domain *domain,
421 struct intel_iommu *skip)
423 struct dmar_drhd_unit *drhd;
424 struct intel_iommu *iommu;
427 if (!intel_iommu_superpage)
430 /* set iommu_superpage to the smallest common denominator */
432 for_each_active_iommu(iommu, drhd) {
434 if (domain && domain->use_first_level) {
435 if (!cap_fl1gp_support(iommu->cap))
438 mask &= cap_super_page_val(iommu->cap);
450 static int domain_update_device_node(struct dmar_domain *domain)
452 struct device_domain_info *info;
453 int nid = NUMA_NO_NODE;
456 spin_lock_irqsave(&domain->lock, flags);
457 list_for_each_entry(info, &domain->devices, link) {
 * There could possibly be multiple device numa nodes as devices
 * within the same domain may sit behind different IOMMUs. There
 * is no perfect answer in such a situation, so we select a
 * first come, first served policy.
464 nid = dev_to_node(info->dev);
465 if (nid != NUMA_NO_NODE)
468 spin_unlock_irqrestore(&domain->lock, flags);
473 /* Return the super pagesize bitmap if supported. */
474 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
476 unsigned long bitmap = 0;
479 * 1-level super page supports page size of 2MiB, 2-level super page
480 * supports page size of both 2MiB and 1GiB.
482 if (domain->iommu_superpage == 1)
484 else if (domain->iommu_superpage == 2)
485 bitmap |= SZ_2M | SZ_1G;
490 /* Some capabilities may be different across iommus */
491 void domain_update_iommu_cap(struct dmar_domain *domain)
493 domain_update_iommu_coherency(domain);
494 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 * If RHSA is missing, we should default to the device numa domain
 * as well.
500 if (domain->nid == NUMA_NO_NODE)
501 domain->nid = domain_update_device_node(domain);
 * First-level translation restricts the input address to a
 * canonical address (i.e., address bits 63:N have the same
 * value as address bit [N-1], where N is 48 bits with 4-level
 * paging and 57 bits with 5-level paging). Hence, skip bit
 * [N-1].
510 if (domain->use_first_level)
511 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
513 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
515 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
516 domain_update_iotlb(domain);
519 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
522 struct root_entry *root = &iommu->root_entry[bus];
523 struct context_entry *context;
 * Unless the caller requested the allocation of a new entry,
 * returning a copied context entry makes no sense.
530 if (!alloc && context_copied(iommu, bus, devfn))
534 if (sm_supported(iommu)) {
542 context = phys_to_virt(*entry & VTD_PAGE_MASK);
544 unsigned long phy_addr;
548 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
552 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
553 phy_addr = virt_to_phys((void *)context);
554 *entry = phy_addr | 1;
555 __iommu_flush_cache(iommu, entry, sizeof(*entry));
557 return &context[devfn];
561 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
562 * sub-hierarchy of a candidate PCI-PCI bridge
563 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
564 * @bridge: the candidate PCI-PCI bridge
566 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
569 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
571 struct pci_dev *pdev, *pbridge;
573 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
576 pdev = to_pci_dev(dev);
577 pbridge = to_pci_dev(bridge);
579 if (pbridge->subordinate &&
580 pbridge->subordinate->number <= pdev->bus->number &&
581 pbridge->subordinate->busn_res.end >= pdev->bus->number)
587 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
589 struct dmar_drhd_unit *drhd;
593 /* We know that this device on this chipset has its own IOMMU.
594 * If we find it under a different IOMMU, then the BIOS is lying
595 * to us. Hope that the IOMMU for this device is actually
596 * disabled, and it needs no translation...
598 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
601 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
/* we know that this iommu should be at offset 0xa000 from vtbar */
607 drhd = dmar_find_matched_drhd_unit(pdev);
608 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
609 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
610 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
617 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
619 if (!iommu || iommu->drhd->ignored)
622 if (dev_is_pci(dev)) {
623 struct pci_dev *pdev = to_pci_dev(dev);
625 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
626 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
627 quirk_ioat_snb_local_iommu(pdev))
634 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
636 struct dmar_drhd_unit *drhd = NULL;
637 struct pci_dev *pdev = NULL;
638 struct intel_iommu *iommu;
646 if (dev_is_pci(dev)) {
647 struct pci_dev *pf_pdev;
649 pdev = pci_real_dma_dev(to_pci_dev(dev));
651 /* VFs aren't listed in scope tables; we need to look up
652 * the PF instead to find the IOMMU. */
653 pf_pdev = pci_physfn(pdev);
655 segment = pci_domain_nr(pdev->bus);
656 } else if (has_acpi_companion(dev))
657 dev = &ACPI_COMPANION(dev)->dev;
660 for_each_iommu(iommu, drhd) {
661 if (pdev && segment != drhd->segment)
664 for_each_active_dev_scope(drhd->devices,
665 drhd->devices_cnt, i, tmp) {
667 /* For a VF use its original BDF# not that of the PF
668 * which we used for the IOMMU lookup. Strictly speaking
669 * we could do this for all PCI devices; we only need to
670 * get the BDF# from the scope table for ACPI matches. */
671 if (pdev && pdev->is_virtfn)
675 *bus = drhd->devices[i].bus;
676 *devfn = drhd->devices[i].devfn;
681 if (is_downstream_to_pci_bridge(dev, tmp))
685 if (pdev && drhd->include_all) {
688 *bus = pdev->bus->number;
689 *devfn = pdev->devfn;
696 if (iommu_is_dummy(iommu, dev))
704 static void domain_flush_cache(struct dmar_domain *domain,
705 void *addr, int size)
707 if (!domain->iommu_coherency)
708 clflush_cache_range(addr, size);
711 static void free_context_table(struct intel_iommu *iommu)
713 struct context_entry *context;
716 if (!iommu->root_entry)
719 for (i = 0; i < ROOT_ENTRY_NR; i++) {
720 context = iommu_context_addr(iommu, i, 0, 0);
722 free_pgtable_page(context);
724 if (!sm_supported(iommu))
727 context = iommu_context_addr(iommu, i, 0x80, 0);
729 free_pgtable_page(context);
732 free_pgtable_page(iommu->root_entry);
733 iommu->root_entry = NULL;
736 #ifdef CONFIG_DMAR_DEBUG
737 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
738 u8 bus, u8 devfn, struct dma_pte *parent, int level)
744 offset = pfn_level_offset(pfn, level);
745 pte = &parent[offset];
746 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
747 pr_info("PTE not present at level %d\n", level);
751 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
756 parent = phys_to_virt(dma_pte_addr(pte));
761 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
762 unsigned long long addr, u32 pasid)
764 struct pasid_dir_entry *dir, *pde;
765 struct pasid_entry *entries, *pte;
766 struct context_entry *ctx_entry;
767 struct root_entry *rt_entry;
768 int i, dir_index, index, level;
769 u8 devfn = source_id & 0xff;
770 u8 bus = source_id >> 8;
771 struct dma_pte *pgtable;
773 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
775 /* root entry dump */
776 rt_entry = &iommu->root_entry[bus];
778 pr_info("root table entry is not present\n");
782 if (sm_supported(iommu))
783 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
784 rt_entry->hi, rt_entry->lo);
786 pr_info("root entry: 0x%016llx", rt_entry->lo);
788 /* context entry dump */
789 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
791 pr_info("context table entry is not present\n");
795 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
796 ctx_entry->hi, ctx_entry->lo);
798 /* legacy mode does not require PASID entries */
799 if (!sm_supported(iommu)) {
800 level = agaw_to_level(ctx_entry->hi & 7);
801 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
805 /* get the pointer to pasid directory entry */
806 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
808 pr_info("pasid directory entry is not present\n");
811 /* For request-without-pasid, get the pasid from context entry */
812 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
813 pasid = IOMMU_NO_PASID;
815 dir_index = pasid >> PASID_PDE_SHIFT;
816 pde = &dir[dir_index];
817 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
819 /* get the pointer to the pasid table entry */
820 entries = get_pasid_table_from_pde(pde);
822 pr_info("pasid table entry is not present\n");
825 index = pasid & PASID_PTE_MASK;
826 pte = &entries[index];
827 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
828 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
830 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
831 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
832 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
834 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
835 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
839 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
843 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
844 unsigned long pfn, int *target_level,
847 struct dma_pte *parent, *pte;
848 int level = agaw_to_level(domain->agaw);
851 if (!domain_pfn_supported(domain, pfn))
852 /* Address beyond IOMMU's addressing capabilities. */
855 parent = domain->pgd;
860 offset = pfn_level_offset(pfn, level);
861 pte = &parent[offset];
862 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
864 if (level == *target_level)
867 if (!dma_pte_present(pte)) {
870 tmp_page = alloc_pgtable_page(domain->nid, gfp);
875 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
876 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
877 if (domain->use_first_level)
878 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
880 if (cmpxchg64(&pte->val, 0ULL, pteval))
881 /* Someone else set it while we were thinking; use theirs. */
882 free_pgtable_page(tmp_page);
884 domain_flush_cache(domain, pte, sizeof(*pte));
889 parent = phys_to_virt(dma_pte_addr(pte));
894 *target_level = level;
899 /* return address's pte at specific level */
900 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
902 int level, int *large_page)
904 struct dma_pte *parent, *pte;
905 int total = agaw_to_level(domain->agaw);
908 parent = domain->pgd;
909 while (level <= total) {
910 offset = pfn_level_offset(pfn, total);
911 pte = &parent[offset];
915 if (!dma_pte_present(pte)) {
920 if (dma_pte_superpage(pte)) {
925 parent = phys_to_virt(dma_pte_addr(pte));
/* clear last level pte; should be followed by a TLB flush */
932 static void dma_pte_clear_range(struct dmar_domain *domain,
933 unsigned long start_pfn,
934 unsigned long last_pfn)
936 unsigned int large_page;
937 struct dma_pte *first_pte, *pte;
939 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
940 WARN_ON(start_pfn > last_pfn))
943 /* we don't need lock here; nobody else touches the iova range */
946 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
948 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
953 start_pfn += lvl_to_nr_pages(large_page);
955 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
957 domain_flush_cache(domain, first_pte,
958 (void *)pte - (void *)first_pte);
960 } while (start_pfn && start_pfn <= last_pfn);
963 static void dma_pte_free_level(struct dmar_domain *domain, int level,
964 int retain_level, struct dma_pte *pte,
965 unsigned long pfn, unsigned long start_pfn,
966 unsigned long last_pfn)
968 pfn = max(start_pfn, pfn);
969 pte = &pte[pfn_level_offset(pfn, level)];
972 unsigned long level_pfn;
973 struct dma_pte *level_pte;
975 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
978 level_pfn = pfn & level_mask(level);
979 level_pte = phys_to_virt(dma_pte_addr(pte));
982 dma_pte_free_level(domain, level - 1, retain_level,
983 level_pte, level_pfn, start_pfn,
988 * Free the page table if we're below the level we want to
989 * retain and the range covers the entire table.
991 if (level < retain_level && !(start_pfn > level_pfn ||
992 last_pfn < level_pfn + level_size(level) - 1)) {
994 domain_flush_cache(domain, pte, sizeof(*pte));
995 free_pgtable_page(level_pte);
998 pfn += level_size(level);
999 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1003 * clear last level (leaf) ptes and free page table pages below the
1004 * level we wish to keep intact.
1006 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1007 unsigned long start_pfn,
1008 unsigned long last_pfn,
1011 dma_pte_clear_range(domain, start_pfn, last_pfn);
1013 /* We don't need lock here; nobody else touches the iova range */
1014 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1015 domain->pgd, 0, start_pfn, last_pfn);
1018 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1019 free_pgtable_page(domain->pgd);
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
1030 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1031 int level, struct dma_pte *pte,
1032 struct list_head *freelist)
1036 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1037 list_add_tail(&pg->lru, freelist);
1042 pte = page_address(pg);
1044 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1045 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1047 } while (!first_pte_in_page(pte));
1050 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1051 struct dma_pte *pte, unsigned long pfn,
1052 unsigned long start_pfn, unsigned long last_pfn,
1053 struct list_head *freelist)
1055 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1057 pfn = max(start_pfn, pfn);
1058 pte = &pte[pfn_level_offset(pfn, level)];
1061 unsigned long level_pfn = pfn & level_mask(level);
1063 if (!dma_pte_present(pte))
1066 /* If range covers entire pagetable, free it */
1067 if (start_pfn <= level_pfn &&
1068 last_pfn >= level_pfn + level_size(level) - 1) {
/* These subordinate page tables are going away entirely. Don't
   bother to clear them; we're just going to *free* them. */
1071 if (level > 1 && !dma_pte_superpage(pte))
1072 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1078 } else if (level > 1) {
1079 /* Recurse down into a level that isn't *entirely* obsolete */
1080 dma_pte_clear_level(domain, level - 1,
1081 phys_to_virt(dma_pte_addr(pte)),
1082 level_pfn, start_pfn, last_pfn,
1086 pfn = level_pfn + level_size(level);
1087 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1090 domain_flush_cache(domain, first_pte,
1091 (void *)++last_pte - (void *)first_pte);
1094 /* We can't just free the pages because the IOMMU may still be walking
1095 the page tables, and may have cached the intermediate levels. The
1096 pages can only be freed after the IOTLB flush has been done. */
1097 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1098 unsigned long last_pfn, struct list_head *freelist)
1100 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1101 WARN_ON(start_pfn > last_pfn))
1104 /* we don't need lock here; nobody else touches the iova range */
1105 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1106 domain->pgd, 0, start_pfn, last_pfn, freelist);
1109 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1110 struct page *pgd_page = virt_to_page(domain->pgd);
1111 list_add_tail(&pgd_page->lru, freelist);
1116 /* iommu handling */
1117 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1119 struct root_entry *root;
1121 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1123 pr_err("Allocating root entry for %s failed\n",
1128 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1129 iommu->root_entry = root;
1134 static void iommu_set_root_entry(struct intel_iommu *iommu)
1140 addr = virt_to_phys(iommu->root_entry);
1141 if (sm_supported(iommu))
1142 addr |= DMA_RTADDR_SMT;
1144 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1145 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1147 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1150 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1151 readl, (sts & DMA_GSTS_RTPS), sts);
1153 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1156 * Hardware invalidates all DMA remapping hardware translation
1157 * caches as part of SRTP flow.
1159 if (cap_esrtps(iommu->cap))
1162 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1163 if (sm_supported(iommu))
1164 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1165 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1168 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1173 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1176 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1177 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1180 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1181 readl, (!(val & DMA_GSTS_WBFS)), val);
1183 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1187 static void __iommu_flush_context(struct intel_iommu *iommu,
1188 u16 did, u16 source_id, u8 function_mask,
1195 case DMA_CCMD_GLOBAL_INVL:
1196 val = DMA_CCMD_GLOBAL_INVL;
1198 case DMA_CCMD_DOMAIN_INVL:
1199 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1201 case DMA_CCMD_DEVICE_INVL:
1202 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1203 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1206 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1210 val |= DMA_CCMD_ICC;
1212 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
/* Make sure hardware completes it */
1216 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1217 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1219 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1223 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1224 u64 addr, unsigned int size_order, u64 type)
1226 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1227 u64 val = 0, val_iva = 0;
1231 case DMA_TLB_GLOBAL_FLUSH:
/* global flush doesn't need to set IVA_REG */
1233 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1235 case DMA_TLB_DSI_FLUSH:
1236 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1238 case DMA_TLB_PSI_FLUSH:
1239 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1240 /* IH bit is passed in as part of address */
1241 val_iva = size_order | addr;
1244 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1249 if (cap_write_drain(iommu->cap))
1250 val |= DMA_TLB_WRITE_DRAIN;
1252 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253 /* Note: Only uses first TLB reg currently */
1255 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1256 dmar_writeq(iommu->reg + tlb_offset + 8, val);
/* Make sure hardware completes it */
1259 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1260 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1262 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1264 /* check IOTLB invalidation granularity */
1265 if (DMA_TLB_IAIG(val) == 0)
1266 pr_err("Flush IOTLB failed\n");
1267 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1268 pr_debug("TLB flush request %Lx, actual %Lx\n",
1269 (unsigned long long)DMA_TLB_IIRG(type),
1270 (unsigned long long)DMA_TLB_IAIG(val));
1273 static struct device_domain_info *
1274 domain_lookup_dev_info(struct dmar_domain *domain,
1275 struct intel_iommu *iommu, u8 bus, u8 devfn)
1277 struct device_domain_info *info;
1278 unsigned long flags;
1280 spin_lock_irqsave(&domain->lock, flags);
1281 list_for_each_entry(info, &domain->devices, link) {
1282 if (info->iommu == iommu && info->bus == bus &&
1283 info->devfn == devfn) {
1284 spin_unlock_irqrestore(&domain->lock, flags);
1288 spin_unlock_irqrestore(&domain->lock, flags);
1293 void domain_update_iotlb(struct dmar_domain *domain)
1295 struct dev_pasid_info *dev_pasid;
1296 struct device_domain_info *info;
1297 bool has_iotlb_device = false;
1298 unsigned long flags;
1300 spin_lock_irqsave(&domain->lock, flags);
1301 list_for_each_entry(info, &domain->devices, link) {
1302 if (info->ats_enabled) {
1303 has_iotlb_device = true;
1308 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1309 info = dev_iommu_priv_get(dev_pasid->dev);
1310 if (info->ats_enabled) {
1311 has_iotlb_device = true;
1315 domain->has_iotlb_device = has_iotlb_device;
1316 spin_unlock_irqrestore(&domain->lock, flags);
 * The extra devTLB flush quirk impacts those QAT devices with PCI device
 * IDs ranging from 0x4940 to 0x4943. It is exempted from the risky_device()
 * check because it applies only to the built-in QAT devices and it doesn't
 * grant additional privileges.
1325 #define BUGGY_QAT_DEVID_MASK 0x4940
1326 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1328 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1331 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1337 static void iommu_enable_pci_caps(struct device_domain_info *info)
1339 struct pci_dev *pdev;
1341 if (!dev_is_pci(info->dev))
1344 pdev = to_pci_dev(info->dev);
/* The PCIe spec, in its wisdom, declares that the behaviour of
   the device if you enable PASID support after ATS support is
   undefined. So always enable PASID support on devices which
   have it, even if we can't yet know if we're ever going to
   use it. */
1351 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1352 info->pasid_enabled = 1;
1354 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1355 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1356 info->ats_enabled = 1;
1357 domain_update_iotlb(info->domain);
1361 static void iommu_disable_pci_caps(struct device_domain_info *info)
1363 struct pci_dev *pdev;
1365 if (!dev_is_pci(info->dev))
1368 pdev = to_pci_dev(info->dev);
1370 if (info->ats_enabled) {
1371 pci_disable_ats(pdev);
1372 info->ats_enabled = 0;
1373 domain_update_iotlb(info->domain);
1376 if (info->pasid_enabled) {
1377 pci_disable_pasid(pdev);
1378 info->pasid_enabled = 0;
1382 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1383 u64 addr, unsigned int mask)
1387 if (!info || !info->ats_enabled)
1390 sid = info->bus << 8 | info->devfn;
1391 qdep = info->ats_qdep;
1392 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1394 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1397 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1398 u64 addr, unsigned mask)
1400 struct dev_pasid_info *dev_pasid;
1401 struct device_domain_info *info;
1402 unsigned long flags;
1404 if (!domain->has_iotlb_device)
1407 spin_lock_irqsave(&domain->lock, flags);
1408 list_for_each_entry(info, &domain->devices, link)
1409 __iommu_flush_dev_iotlb(info, addr, mask);
1411 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1412 info = dev_iommu_priv_get(dev_pasid->dev);
1414 if (!info->ats_enabled)
1417 qi_flush_dev_iotlb_pasid(info->iommu,
1418 PCI_DEVID(info->bus, info->devfn),
1419 info->pfsid, dev_pasid->pasid,
1420 info->ats_qdep, addr,
1423 spin_unlock_irqrestore(&domain->lock, flags);
1426 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1427 struct dmar_domain *domain, u64 addr,
1428 unsigned long npages, bool ih)
1430 u16 did = domain_id_iommu(domain, iommu);
1431 struct dev_pasid_info *dev_pasid;
1432 unsigned long flags;
1434 spin_lock_irqsave(&domain->lock, flags);
1435 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1436 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1438 if (!list_empty(&domain->devices))
1439 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1440 spin_unlock_irqrestore(&domain->lock, flags);
1443 static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1444 unsigned long pfn, unsigned int pages,
1447 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1448 unsigned long bitmask = aligned_pages - 1;
1449 unsigned int mask = ilog2(aligned_pages);
1450 u64 addr = (u64)pfn << VTD_PAGE_SHIFT;
1453 * PSI masks the low order bits of the base address. If the
1454 * address isn't aligned to the mask, then compute a mask value
1455 * needed to ensure the target range is flushed.
1457 if (unlikely(bitmask & pfn)) {
1458 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1461 * Since end_pfn <= pfn + bitmask, the only way bits
1462 * higher than bitmask can differ in pfn and end_pfn is
1463 * by carrying. This means after masking out bitmask,
1464 * high bits starting with the first set bit in
1465 * shared_bits are all equal in both pfn and end_pfn.
1467 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1468 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
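/*
 * Worked example: pfn == 0x1009 and pages == 2 gives aligned_pages == 2
 * and bitmask == 1, so the base is unaligned; end_pfn == 0x100a,
 * shared_bits == ~(0x1009 ^ 0x100a) & ~1 has its lowest set bit at 2,
 * and mask == 2 flushes pfns 0x1008-0x100b, covering the whole range.
 */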
1472 * Fallback to domain selective flush if no PSI support or
1473 * the size is too big.
1475 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1476 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1479 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1483 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1484 struct dmar_domain *domain,
1485 unsigned long pfn, unsigned int pages,
1488 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1489 unsigned int mask = ilog2(aligned_pages);
1490 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1491 u16 did = domain_id_iommu(domain, iommu);
1493 if (WARN_ON(!pages))
1499 if (domain->use_first_level)
1500 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1502 __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);
1505 * In caching mode, changes of pages from non-present to present require
1506 * flush. However, device IOTLB doesn't need to be flushed in this case.
1508 if (!cap_caching_mode(iommu->cap) || !map)
1509 iommu_flush_dev_iotlb(domain, addr, mask);
1512 /* Notification for newly created mappings */
1513 static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1514 unsigned long pfn, unsigned int pages)
 * It's a non-present to present mapping. Only flush if caching mode
 * and second level.
1520 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1521 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1523 iommu_flush_write_buffer(iommu);
1527 * Flush the relevant caches in nested translation if the domain
1528 * also serves as a parent
1530 static void parent_domain_flush(struct dmar_domain *domain,
1532 unsigned long pages, int ih)
1534 struct dmar_domain *s1_domain;
1536 spin_lock(&domain->s1_lock);
1537 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
1538 struct device_domain_info *device_info;
1539 struct iommu_domain_info *info;
1540 unsigned long flags;
1543 xa_for_each(&s1_domain->iommu_array, i, info)
1544 __iommu_flush_iotlb_psi(info->iommu, info->did,
1547 if (!s1_domain->has_iotlb_device)
1550 spin_lock_irqsave(&s1_domain->lock, flags);
1551 list_for_each_entry(device_info, &s1_domain->devices, link)
 * The address translation cache on the device side caches the
 * result of nested translation. There is no easy way
 * to identify the exact set of nested translations
 * affected by a change in S2. So just flush the entire
 * device cache.
1559 __iommu_flush_dev_iotlb(device_info, 0,
1560 MAX_AGAW_PFN_WIDTH);
1561 spin_unlock_irqrestore(&s1_domain->lock, flags);
1563 spin_unlock(&domain->s1_lock);
1566 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1568 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1569 struct iommu_domain_info *info;
1572 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1573 struct intel_iommu *iommu = info->iommu;
1574 u16 did = domain_id_iommu(dmar_domain, iommu);
1576 if (dmar_domain->use_first_level)
1577 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1579 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1582 if (!cap_caching_mode(iommu->cap))
1583 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1586 if (dmar_domain->nested_parent)
1587 parent_domain_flush(dmar_domain, 0, -1, 0);
1590 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1593 unsigned long flags;
1595 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1598 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1599 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1600 pmen &= ~DMA_PMEN_EPM;
1601 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1603 /* wait for the protected region status bit to clear */
1604 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1605 readl, !(pmen & DMA_PMEN_PRS), pmen);
1607 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1610 static void iommu_enable_translation(struct intel_iommu *iommu)
1613 unsigned long flags;
1615 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1616 iommu->gcmd |= DMA_GCMD_TE;
1617 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1620 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1621 readl, (sts & DMA_GSTS_TES), sts);
1623 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1626 static void iommu_disable_translation(struct intel_iommu *iommu)
1631 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1632 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1635 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1636 iommu->gcmd &= ~DMA_GCMD_TE;
1637 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
/* Make sure hardware completes it */
1640 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1641 readl, (!(sts & DMA_GSTS_TES)), sts);
1643 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1646 static int iommu_init_domains(struct intel_iommu *iommu)
1650 ndomains = cap_ndoms(iommu->cap);
1651 pr_debug("%s: Number of Domains supported <%d>\n",
1652 iommu->name, ndomains);
1654 spin_lock_init(&iommu->lock);
1656 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1657 if (!iommu->domain_ids)
1661 * If Caching mode is set, then invalid translations are tagged
1662 * with domain-id 0, hence we need to pre-allocate it. We also
1663 * use domain-id 0 as a marker for non-allocated domain-id, so
1664 * make sure it is not used for a real domain.
1666 set_bit(0, iommu->domain_ids);
 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
 * entry for first-level or pass-through translation modes should
 * be programmed with a domain id different from those used for
 * second-level or nested translation. We reserve a domain id for
 * this purpose.
1675 if (sm_supported(iommu))
1676 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1681 static void disable_dmar_iommu(struct intel_iommu *iommu)
1683 if (!iommu->domain_ids)
1687 * All iommu domains must have been detached from the devices,
1688 * hence there should be no domain IDs in use.
1690 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1691 > NUM_RESERVED_DID))
1694 if (iommu->gcmd & DMA_GCMD_TE)
1695 iommu_disable_translation(iommu);
1698 static void free_dmar_iommu(struct intel_iommu *iommu)
1700 if (iommu->domain_ids) {
1701 bitmap_free(iommu->domain_ids);
1702 iommu->domain_ids = NULL;
1705 if (iommu->copied_tables) {
1706 bitmap_free(iommu->copied_tables);
1707 iommu->copied_tables = NULL;
1710 /* free context mapping */
1711 free_context_table(iommu);
1713 #ifdef CONFIG_INTEL_IOMMU_SVM
1714 if (pasid_supported(iommu)) {
1715 if (ecap_prs(iommu->ecap))
1716 intel_svm_finish_prq(iommu);
 * Check and return whether first level is used by default for
 * DMA translation.
1725 static bool first_level_by_default(unsigned int type)
1727 /* Only SL is available in legacy mode */
1728 if (!scalable_mode_support())
/* Only one level (either FL or SL) is available, just use it */
1732 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1733 return intel_cap_flts_sanity();
1735 /* Both levels are available, decide it based on domain type */
1736 return type != IOMMU_DOMAIN_UNMANAGED;
1739 static struct dmar_domain *alloc_domain(unsigned int type)
1741 struct dmar_domain *domain;
1743 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1747 domain->nid = NUMA_NO_NODE;
1748 if (first_level_by_default(type))
1749 domain->use_first_level = true;
1750 domain->has_iotlb_device = false;
1751 INIT_LIST_HEAD(&domain->devices);
1752 INIT_LIST_HEAD(&domain->dev_pasids);
1753 spin_lock_init(&domain->lock);
1754 xa_init(&domain->iommu_array);
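/*
 * Attach @domain to @iommu: allocate a domain id on that iommu (or reuse
 * the one already allocated for this domain) and record the per-iommu
 * info in domain->iommu_array, indexed by the iommu's sequence id.
 */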
1759 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1761 struct iommu_domain_info *info, *curr;
1762 unsigned long ndomains;
1763 int num, ret = -ENOSPC;
1765 info = kzalloc(sizeof(*info), GFP_KERNEL);
1769 spin_lock(&iommu->lock);
1770 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1773 spin_unlock(&iommu->lock);
1778 ndomains = cap_ndoms(iommu->cap);
1779 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1780 if (num >= ndomains) {
1781 pr_err("%s: No free domain ids\n", iommu->name);
1785 set_bit(num, iommu->domain_ids);
1788 info->iommu = iommu;
1789 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1790 NULL, info, GFP_ATOMIC);
1792 ret = xa_err(curr) ? : -EBUSY;
1795 domain_update_iommu_cap(domain);
1797 spin_unlock(&iommu->lock);
1801 clear_bit(info->did, iommu->domain_ids);
1803 spin_unlock(&iommu->lock);
1808 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1810 struct iommu_domain_info *info;
1812 spin_lock(&iommu->lock);
1813 info = xa_load(&domain->iommu_array, iommu->seq_id);
1814 if (--info->refcnt == 0) {
1815 clear_bit(info->did, iommu->domain_ids);
1816 xa_erase(&domain->iommu_array, iommu->seq_id);
1817 domain->nid = NUMA_NO_NODE;
1818 domain_update_iommu_cap(domain);
1821 spin_unlock(&iommu->lock);
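/*
 * Round a guest address width up to the nearest width the page-table
 * hierarchy can represent: an adjusted width is 12 bits of page offset
 * plus a multiple of the 9-bit stride per page-table level.
 */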
1824 static int guestwidth_to_adjustwidth(int gaw)
1827 int r = (gaw - 12) % 9;
1838 static void domain_exit(struct dmar_domain *domain)
1841 LIST_HEAD(freelist);
1843 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1844 put_pages_list(&freelist);
1847 if (WARN_ON(!list_empty(&domain->devices)))
1853 static int domain_context_mapping_one(struct dmar_domain *domain,
1854 struct intel_iommu *iommu,
1857 struct device_domain_info *info =
1858 domain_lookup_dev_info(domain, iommu, bus, devfn);
1859 u16 did = domain_id_iommu(domain, iommu);
1860 int translation = CONTEXT_TT_MULTI_LEVEL;
1861 struct dma_pte *pgd = domain->pgd;
1862 struct context_entry *context;
1865 if (hw_pass_through && domain_type_is_si(domain))
1866 translation = CONTEXT_TT_PASS_THROUGH;
1868 pr_debug("Set context mapping for %02x:%02x.%d\n",
1869 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1871 spin_lock(&iommu->lock);
1873 context = iommu_context_addr(iommu, bus, devfn, 1);
1878 if (context_present(context) && !context_copied(iommu, bus, devfn))
 * For kdump cases, old valid entries may be cached due to the
 * in-flight DMA and copied pgtable, but there is no unmapping
 * behaviour for them, thus we need an explicit cache flush for
 * the newly-mapped device. For kdump, at this point, the device
 * is supposed to have finished reset at its driver probe stage, so no
 * in-flight DMA will exist, and we don't need to worry anymore
 * hereafter.
1890 if (context_copied(iommu, bus, devfn)) {
1891 u16 did_old = context_domain_id(context);
1893 if (did_old < cap_ndoms(iommu->cap)) {
1894 iommu->flush.flush_context(iommu, did_old,
1895 (((u16)bus) << 8) | devfn,
1896 DMA_CCMD_MASK_NOBIT,
1897 DMA_CCMD_DEVICE_INVL);
1898 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1902 clear_context_copied(iommu, bus, devfn);
1905 context_clear_entry(context);
1906 context_set_domain_id(context, did);
1908 if (translation != CONTEXT_TT_PASS_THROUGH) {
1910 * Skip top levels of page tables for iommu which has
1911 * less agaw than default. Unnecessary for PT mode.
1913 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1915 pgd = phys_to_virt(dma_pte_addr(pgd));
1916 if (!dma_pte_present(pgd))
1920 if (info && info->ats_supported)
1921 translation = CONTEXT_TT_DEV_IOTLB;
1923 translation = CONTEXT_TT_MULTI_LEVEL;
1925 context_set_address_root(context, virt_to_phys(pgd));
1926 context_set_address_width(context, agaw);
1929 * In pass through mode, AW must be programmed to
1930 * indicate the largest AGAW value supported by
1931 * hardware. And ASR is ignored by hardware.
1933 context_set_address_width(context, iommu->msagaw);
1936 context_set_translation_type(context, translation);
1937 context_set_fault_enable(context);
1938 context_set_present(context);
1939 if (!ecap_coherent(iommu->ecap))
1940 clflush_cache_range(context, sizeof(*context));
 * It's a non-present to present mapping. If hardware doesn't cache
 * non-present entries we only need to flush the write-buffer. If it
 * _does_ cache non-present entries, then it does so in the special
 * domain #0, which we have to flush:
1948 if (cap_caching_mode(iommu->cap)) {
1949 iommu->flush.flush_context(iommu, 0,
1950 (((u16)bus) << 8) | devfn,
1951 DMA_CCMD_MASK_NOBIT,
1952 DMA_CCMD_DEVICE_INVL);
1953 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1955 iommu_flush_write_buffer(iommu);
1961 spin_unlock(&iommu->lock);
1966 static int domain_context_mapping_cb(struct pci_dev *pdev,
1967 u16 alias, void *opaque)
1969 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1970 struct intel_iommu *iommu = info->iommu;
1971 struct dmar_domain *domain = opaque;
1973 return domain_context_mapping_one(domain, iommu,
1974 PCI_BUS_NUM(alias), alias & 0xff);
1978 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1980 struct device_domain_info *info = dev_iommu_priv_get(dev);
1981 struct intel_iommu *iommu = info->iommu;
1982 u8 bus = info->bus, devfn = info->devfn;
1984 if (!dev_is_pci(dev))
1985 return domain_context_mapping_one(domain, iommu, bus, devfn);
1987 return pci_for_each_dma_alias(to_pci_dev(dev),
1988 domain_context_mapping_cb, domain);
1991 /* Returns a number of VTD pages, but aligned to MM page size */
1992 static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1994 host_addr &= ~PAGE_MASK;
1995 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
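/*
 * Example: host_addr == 0x1234 and size == 0x100 leaves an in-page offset
 * of 0x234, so PAGE_ALIGN(0x234 + 0x100) >> VTD_PAGE_SHIFT == 1, i.e. the
 * mapping touches a single 4KiB VT-d page.
 */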
1998 /* Return largest possible superpage level for a given mapping */
1999 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
2000 unsigned long phy_pfn, unsigned long pages)
2002 int support, level = 1;
2003 unsigned long pfnmerge;
2005 support = domain->iommu_superpage;
2007 /* To use a large page, the virtual *and* physical addresses
2008 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2009 of them will mean we have to use smaller pages. So just
2010 merge them and check both at once. */
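/*
 * Example: if iov_pfn and phy_pfn are both 2MiB aligned (multiples of 512
 * 4KiB pages) and at least 512 pages are mapped, the loop below reaches
 * level 2 and a 2MiB superpage can be used; otherwise level stays at 1.
 */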
2011 pfnmerge = iov_pfn | phy_pfn;
2013 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2014 pages >>= VTD_STRIDE_SHIFT;
2017 pfnmerge >>= VTD_STRIDE_SHIFT;
2025 * Ensure that old small page tables are removed to make room for superpage(s).
2026 * We're going to add new large pages, so make sure we don't remove their parent
2027 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2029 static void switch_to_super_page(struct dmar_domain *domain,
2030 unsigned long start_pfn,
2031 unsigned long end_pfn, int level)
2033 unsigned long lvl_pages = lvl_to_nr_pages(level);
2034 struct iommu_domain_info *info;
2035 struct dma_pte *pte = NULL;
2038 while (start_pfn <= end_pfn) {
2040 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2043 if (dma_pte_present(pte)) {
2044 dma_pte_free_pagetable(domain, start_pfn,
2045 start_pfn + lvl_pages - 1,
2048 xa_for_each(&domain->iommu_array, i, info)
2049 iommu_flush_iotlb_psi(info->iommu, domain,
2050 start_pfn, lvl_pages,
2052 if (domain->nested_parent)
2053 parent_domain_flush(domain, start_pfn,
2058 start_pfn += lvl_pages;
2059 if (first_pte_in_page(pte))
2065 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2066 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2069 struct dma_pte *first_pte = NULL, *pte = NULL;
2070 unsigned int largepage_lvl = 0;
2071 unsigned long lvl_pages = 0;
2075 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2078 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2081 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2082 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2086 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2087 attr |= DMA_FL_PTE_PRESENT;
2088 if (domain->use_first_level) {
2089 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2090 if (prot & DMA_PTE_WRITE)
2091 attr |= DMA_FL_PTE_DIRTY;
2094 domain->has_mappings = true;
2096 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2098 while (nr_pages > 0) {
2102 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2103 phys_pfn, nr_pages);
2105 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2111 lvl_pages = lvl_to_nr_pages(largepage_lvl);
/* It is a large page */
2114 if (largepage_lvl > 1) {
2115 unsigned long end_pfn;
2116 unsigned long pages_to_remove;
2118 pteval |= DMA_PTE_LARGE_PAGE;
2119 pages_to_remove = min_t(unsigned long, nr_pages,
2120 nr_pte_to_next_page(pte) * lvl_pages);
2121 end_pfn = iov_pfn + pages_to_remove - 1;
2122 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2124 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2128 /* We don't need lock here, nobody else
2129 * touches the iova range
2131 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2133 static int dumps = 5;
2134 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2135 iov_pfn, tmp, (unsigned long long)pteval);
2138 debug_dma_dump_mappings(NULL);
2143 nr_pages -= lvl_pages;
2144 iov_pfn += lvl_pages;
2145 phys_pfn += lvl_pages;
2146 pteval += lvl_pages * VTD_PAGE_SIZE;
2148 /* If the next PTE would be the first in a new page, then we
2149 * need to flush the cache on the entries we've just written.
2150 * And then we'll need to recalculate 'pte', so clear it and
2151 * let it get set again in the if (!pte) block above.
2153 * If we're done (!nr_pages) we need to flush the cache too.
2155 * Also if we've been setting superpages, we may need to
2156 * recalculate 'pte' and switch back to smaller pages for the
2157 * end of the mapping, if the trailing size is not enough to
2158 * use another superpage (i.e. nr_pages < lvl_pages).
2161 if (!nr_pages || first_pte_in_page(pte) ||
2162 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2163 domain_flush_cache(domain, first_pte,
2164 (void *)pte - (void *)first_pte);
2172 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2174 struct intel_iommu *iommu = info->iommu;
2175 struct context_entry *context;
2178 spin_lock(&iommu->lock);
2179 context = iommu_context_addr(iommu, bus, devfn, 0);
2181 spin_unlock(&iommu->lock);
2185 did_old = context_domain_id(context);
2187 context_clear_entry(context);
2188 __iommu_flush_cache(iommu, context, sizeof(*context));
2189 spin_unlock(&iommu->lock);
2190 iommu->flush.flush_context(iommu,
2192 (((u16)bus) << 8) | devfn,
2193 DMA_CCMD_MASK_NOBIT,
2194 DMA_CCMD_DEVICE_INVL);
2196 iommu->flush.flush_iotlb(iommu,
2202 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2205 static int domain_setup_first_level(struct intel_iommu *iommu,
2206 struct dmar_domain *domain,
2210 struct dma_pte *pgd = domain->pgd;
2215 * Skip top levels of page tables for iommu which has
2216 * less agaw than default. Unnecessary for PT mode.
2218 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2219 pgd = phys_to_virt(dma_pte_addr(pgd));
2220 if (!dma_pte_present(pgd))
2224 level = agaw_to_level(agaw);
2225 if (level != 4 && level != 5)
2229 flags |= PASID_FLAG_FL5LP;
2231 if (domain->force_snooping)
2232 flags |= PASID_FLAG_PAGE_SNOOP;
2234 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2235 domain_id_iommu(domain, iommu),
2239 static bool dev_is_real_dma_subdevice(struct device *dev)
2241 return dev && dev_is_pci(dev) &&
2242 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2245 static int iommu_domain_identity_map(struct dmar_domain *domain,
2246 unsigned long first_vpfn,
2247 unsigned long last_vpfn)
 * RMRR range might have overlap with physical memory range,
 * clear it first
2253 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2255 return __domain_mapping(domain, first_vpfn,
2256 first_vpfn, last_vpfn - first_vpfn + 1,
2257 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2260 static int md_domain_init(struct dmar_domain *domain, int guest_width);
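/*
 * Set up the static identity (si) domain: create a 1:1 mapping for every
 * usable memory range and for the RMRR regions reported by the platform,
 * so devices attached to it see a 1:1 IOVA-to-physical mapping.
 */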
2262 static int __init si_domain_init(int hw)
2264 struct dmar_rmrr_unit *rmrr;
2268 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2272 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2273 domain_exit(si_domain);
2281 for_each_online_node(nid) {
2282 unsigned long start_pfn, end_pfn;
2285 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2286 ret = iommu_domain_identity_map(si_domain,
2287 mm_to_dma_pfn_start(start_pfn),
2288 mm_to_dma_pfn_end(end_pfn));
 * Identity map the RMRRs so that devices with RMRRs could also use
 * the si_domain.
2298 for_each_rmrr_units(rmrr) {
2299 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2301 unsigned long long start = rmrr->base_address;
2302 unsigned long long end = rmrr->end_address;
2304 if (WARN_ON(end < start ||
2305 end >> agaw_to_width(si_domain->agaw)))
2308 ret = iommu_domain_identity_map(si_domain,
2309 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2310 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2319 static int dmar_domain_attach_device(struct dmar_domain *domain,
2322 struct device_domain_info *info = dev_iommu_priv_get(dev);
2323 struct intel_iommu *iommu = info->iommu;
2324 unsigned long flags;
2327 ret = domain_attach_iommu(domain, iommu);
2330 info->domain = domain;
2331 spin_lock_irqsave(&domain->lock, flags);
2332 list_add(&info->link, &domain->devices);
2333 spin_unlock_irqrestore(&domain->lock, flags);
2335 if (dev_is_real_dma_subdevice(dev))
2338 if (!sm_supported(iommu))
2339 ret = domain_context_mapping(domain, dev);
2340 else if (hw_pass_through && domain_type_is_si(domain))
2341 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2342 else if (domain->use_first_level)
2343 ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2345 ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2348 device_block_translation(dev);
2352 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2353 iommu_enable_pci_caps(info);
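/*
 * The attach path above selects one of the translation setups below (a
 * sketch of the visible branches, not an exhaustive matrix):
 *
 *	legacy mode (!sm_supported)                 -> context-entry mapping
 *	scalable mode, identity domain, HW PT cap   -> pass-through PASID entry
 *	scalable mode, first-level domain           -> first-level PASID entry
 *	scalable mode, otherwise                    -> second-level PASID entry
 *
 * Real DMA sub-devices only join the domain's device list and skip this
 * programming; on failure the device is put back into blocking translation.
 */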
2359 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2360 * is relaxable (ie. is allowed to be not enforced under some conditions)
2361 * @dev: device handle
2363 * We assume that PCI USB devices with RMRRs have them largely
2364 * for historical reasons and that the RMRR space is not actively used post
2365 * boot. This exclusion may change if vendors begin to abuse it.
2367 * The same exception is made for graphics devices, with the requirement that
2368 * any use of the RMRR regions will be torn down before assigning the device
2371 * Return: true if the RMRR is relaxable, false otherwise
2373 static bool device_rmrr_is_relaxable(struct device *dev)
2375 struct pci_dev *pdev;
2377 if (!dev_is_pci(dev))
2380 pdev = to_pci_dev(dev);
2381 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2388 * Return the required default domain type for a specific device.
2390 * @dev: the device in query
2394 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2395 - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2396 * - 0: both identity and dynamic domains work for this device
2398 static int device_def_domain_type(struct device *dev)
2400 if (dev_is_pci(dev)) {
2401 struct pci_dev *pdev = to_pci_dev(dev);
2403 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2404 return IOMMU_DOMAIN_IDENTITY;
2406 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2407 return IOMMU_DOMAIN_IDENTITY;
2413 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2416 * Start from the sane iommu hardware state.
2417 * If the queued invalidation is already initialized by us
2418 * (for example, while enabling interrupt-remapping) then
2419 * things are already rolling from a sane state.
2423 * Clear any previous faults.
2425 dmar_fault(-1, iommu);
2427 * Disable queued invalidation if supported and already enabled
2428 * before OS handover.
2430 dmar_disable_qi(iommu);
2433 if (dmar_enable_qi(iommu)) {
2435 * Queued invalidation not enabled, use register-based invalidation
2437 iommu->flush.flush_context = __iommu_flush_context;
2438 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2439 pr_info("%s: Using Register based invalidation\n",
2442 iommu->flush.flush_context = qi_flush_context;
2443 iommu->flush.flush_iotlb = qi_flush_iotlb;
2444 pr_info("%s: Using Queued invalidation\n", iommu->name);
2448 static int copy_context_table(struct intel_iommu *iommu,
2449 struct root_entry *old_re,
2450 struct context_entry **tbl,
2453 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2454 struct context_entry *new_ce = NULL, ce;
2455 struct context_entry *old_ce = NULL;
2456 struct root_entry re;
2457 phys_addr_t old_ce_phys;
2459 tbl_idx = ext ? bus * 2 : bus;
2460 memcpy(&re, old_re, sizeof(re));
2462 for (devfn = 0; devfn < 256; devfn++) {
2463 /* First calculate the correct index */
2464 idx = (ext ? devfn * 2 : devfn) % 256;
2467 /* First save what we may have and clean up */
2469 tbl[tbl_idx] = new_ce;
2470 __iommu_flush_cache(iommu, new_ce,
2480 old_ce_phys = root_entry_lctp(&re);
2482 old_ce_phys = root_entry_uctp(&re);
2485 if (ext && devfn == 0) {
2486 /* No LCTP, try UCTP */
2495 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2500 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2507 /* Now copy the context entry */
2508 memcpy(&ce, old_ce + idx, sizeof(ce));
2510 if (!context_present(&ce))
2513 did = context_domain_id(&ce);
2514 if (did >= 0 && did < cap_ndoms(iommu->cap))
2515 set_bit(did, iommu->domain_ids);
2517 set_context_copied(iommu, bus, devfn);
2521 tbl[tbl_idx + pos] = new_ce;
2523 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2532 static int copy_translation_tables(struct intel_iommu *iommu)
2534 struct context_entry **ctxt_tbls;
2535 struct root_entry *old_rt;
2536 phys_addr_t old_rt_phys;
2537 int ctxt_table_entries;
2542 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2543 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2544 new_ext = !!sm_supported(iommu);
2547 * The RTT bit can only be changed when translation is disabled,
2548 * but disabling translation means to open a window for data
2549 * corruption. So bail out and don't copy anything if we would
2550 * have to change the bit.
2555 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2556 if (!iommu->copied_tables)
2559 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2563 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2567 /* This is too big for the stack - allocate it from slab */
2568 ctxt_table_entries = ext ? 512 : 256;
2570 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2574 for (bus = 0; bus < 256; bus++) {
2575 ret = copy_context_table(iommu, &old_rt[bus],
2576 ctxt_tbls, bus, ext);
2578 pr_err("%s: Failed to copy context table for bus %d\n",
2584 spin_lock(&iommu->lock);
2586 /* Context tables are copied, now write them to the root_entry table */
2587 for (bus = 0; bus < 256; bus++) {
2588 int idx = ext ? bus * 2 : bus;
2591 if (ctxt_tbls[idx]) {
2592 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2593 iommu->root_entry[bus].lo = val;
2596 if (!ext || !ctxt_tbls[idx + 1])
2599 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2600 iommu->root_entry[bus].hi = val;
2603 spin_unlock(&iommu->lock);
2607 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
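/*
 * Index arithmetic used by the copy above, for illustration: in extended
 * (scalable) mode each bus owns two copied context tables, ctxt_tbls[2 * bus]
 * (devfn 0-127, referenced by the root entry's lo pointer) and
 * ctxt_tbls[2 * bus + 1] (devfn 128-255, hi pointer); legacy mode only uses
 * ctxt_tbls[bus] and root_entry[bus].lo.  E.g. bus 3, devfn 130 in extended
 * mode is copied into ctxt_tbls[7] at index (130 * 2) % 256 == 4.
 */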
2617 static int __init init_dmars(void)
2619 struct dmar_drhd_unit *drhd;
2620 struct intel_iommu *iommu;
2623 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2627 for_each_iommu(iommu, drhd) {
2628 if (drhd->ignored) {
2629 iommu_disable_translation(iommu);
2634 * Find the max pasid size of all IOMMU's in the system.
2635 * We need to ensure the system pasid table is no bigger
2636 * than the smallest supported.
2638 if (pasid_supported(iommu)) {
2639 u32 temp = 2 << ecap_pss(iommu->ecap);
2641 intel_pasid_max_id = min_t(u32, temp,
2642 intel_pasid_max_id);
2645 intel_iommu_init_qi(iommu);
2647 ret = iommu_init_domains(iommu);
2651 init_translation_status(iommu);
2653 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2654 iommu_disable_translation(iommu);
2655 clear_translation_pre_enabled(iommu);
2656 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2662 * we could share the same root & context tables
2663 * among all IOMMUs. Need to split it later.
2665 ret = iommu_alloc_root_entry(iommu);
2669 if (translation_pre_enabled(iommu)) {
2670 pr_info("Translation already enabled - trying to copy translation structures\n");
2672 ret = copy_translation_tables(iommu);
2675 * We found the IOMMU with translation
2676 * enabled - but failed to copy over the
2677 * old root-entry table. Try to proceed
2678 * by disabling translation now and
2679 * allocating a clean root-entry table.
2680 * This might cause DMAR faults, but
2681 * probably the dump will still succeed.
2683 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2685 iommu_disable_translation(iommu);
2686 clear_translation_pre_enabled(iommu);
2688 pr_info("Copied translation tables from previous kernel for %s\n",
2693 if (!ecap_pass_through(iommu->ecap))
2694 hw_pass_through = 0;
2695 intel_svm_check(iommu);
2699 * Now that qi is enabled on all iommus, set the root entry and flush
2700 * caches. This is required on some Intel X58 chipsets, otherwise the
2701 * flush_context function will loop forever and the boot hangs.
2703 for_each_active_iommu(iommu, drhd) {
2704 iommu_flush_write_buffer(iommu);
2705 iommu_set_root_entry(iommu);
2709 iommu_identity_mapping |= IDENTMAP_GFX;
2711 check_tylersburg_isoch();
2713 ret = si_domain_init(hw_pass_through);
2720 * global invalidate context cache
2721 * global invalidate iotlb
2722 * enable translation
2724 for_each_iommu(iommu, drhd) {
2725 if (drhd->ignored) {
2727 * we always have to disable PMRs or DMA may fail on this device
2731 iommu_disable_protect_mem_regions(iommu);
2735 iommu_flush_write_buffer(iommu);
2737 #ifdef CONFIG_INTEL_IOMMU_SVM
2738 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2740 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2741 * could cause a lock race condition, so drop the lock around it.
2743 up_write(&dmar_global_lock);
2744 ret = intel_svm_enable_prq(iommu);
2745 down_write(&dmar_global_lock);
2750 ret = dmar_set_interrupt(iommu);
2758 for_each_active_iommu(iommu, drhd) {
2759 disable_dmar_iommu(iommu);
2760 free_dmar_iommu(iommu);
2763 domain_exit(si_domain);
2770 static void __init init_no_remapping_devices(void)
2772 struct dmar_drhd_unit *drhd;
2776 for_each_drhd_unit(drhd) {
2777 if (!drhd->include_all) {
2778 for_each_active_dev_scope(drhd->devices,
2779 drhd->devices_cnt, i, dev)
2781 /* ignore DMAR unit if no devices exist */
2782 if (i == drhd->devices_cnt)
2787 for_each_active_drhd_unit(drhd) {
2788 if (drhd->include_all)
2791 for_each_active_dev_scope(drhd->devices,
2792 drhd->devices_cnt, i, dev)
2793 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2795 if (i < drhd->devices_cnt)
2798 /* This IOMMU has *only* gfx devices. Either bypass it or
2799 set the gfx_mapped flag, as appropriate */
2800 drhd->gfx_dedicated = 1;
2806 #ifdef CONFIG_SUSPEND
2807 static int init_iommu_hw(void)
2809 struct dmar_drhd_unit *drhd;
2810 struct intel_iommu *iommu = NULL;
2813 for_each_active_iommu(iommu, drhd) {
2815 ret = dmar_reenable_qi(iommu);
2821 for_each_iommu(iommu, drhd) {
2822 if (drhd->ignored) {
2824 * we always have to disable PMRs or DMA may fail on this device
2828 iommu_disable_protect_mem_regions(iommu);
2832 iommu_flush_write_buffer(iommu);
2833 iommu_set_root_entry(iommu);
2834 iommu_enable_translation(iommu);
2835 iommu_disable_protect_mem_regions(iommu);
2841 static void iommu_flush_all(void)
2843 struct dmar_drhd_unit *drhd;
2844 struct intel_iommu *iommu;
2846 for_each_active_iommu(iommu, drhd) {
2847 iommu->flush.flush_context(iommu, 0, 0, 0,
2848 DMA_CCMD_GLOBAL_INVL);
2849 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2850 DMA_TLB_GLOBAL_FLUSH);
2854 static int iommu_suspend(void)
2856 struct dmar_drhd_unit *drhd;
2857 struct intel_iommu *iommu = NULL;
2862 for_each_active_iommu(iommu, drhd) {
2863 iommu_disable_translation(iommu);
2865 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2867 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2868 readl(iommu->reg + DMAR_FECTL_REG);
2869 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2870 readl(iommu->reg + DMAR_FEDATA_REG);
2871 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2872 readl(iommu->reg + DMAR_FEADDR_REG);
2873 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2874 readl(iommu->reg + DMAR_FEUADDR_REG);
2876 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2881 static void iommu_resume(void)
2883 struct dmar_drhd_unit *drhd;
2884 struct intel_iommu *iommu = NULL;
2887 if (init_iommu_hw()) {
2889 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2891 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2895 for_each_active_iommu(iommu, drhd) {
2897 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2899 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2900 iommu->reg + DMAR_FECTL_REG);
2901 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2902 iommu->reg + DMAR_FEDATA_REG);
2903 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2904 iommu->reg + DMAR_FEADDR_REG);
2905 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2906 iommu->reg + DMAR_FEUADDR_REG);
2908 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2912 static struct syscore_ops iommu_syscore_ops = {
2913 .resume = iommu_resume,
2914 .suspend = iommu_suspend,
2917 static void __init init_iommu_pm_ops(void)
2919 register_syscore_ops(&iommu_syscore_ops);
2923 static inline void init_iommu_pm_ops(void) {}
2924 #endif /* CONFIG_PM */
2926 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2928 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2929 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2930 rmrr->end_address <= rmrr->base_address ||
2931 arch_rmrr_sanity_check(rmrr))
2932 return -EINVAL;
2934 return 0;
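/*
 * For illustration, the checks above accept a page-aligned, non-empty RMRR
 * such as base_address 0x7c000000 with end_address 0x7c7fffff (end + 1 is
 * 4KiB aligned), and reject, for example, a region whose base is not page
 * aligned or whose end_address lies at or below its base_address, in
 * addition to whatever arch_rmrr_sanity_check() enforces.
 */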
2937 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2939 struct acpi_dmar_reserved_memory *rmrr;
2940 struct dmar_rmrr_unit *rmrru;
2942 rmrr = (struct acpi_dmar_reserved_memory *)header;
2943 if (rmrr_sanity_check(rmrr)) {
2945 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2946 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2947 rmrr->base_address, rmrr->end_address,
2948 dmi_get_system_info(DMI_BIOS_VENDOR),
2949 dmi_get_system_info(DMI_BIOS_VERSION),
2950 dmi_get_system_info(DMI_PRODUCT_VERSION));
2951 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2954 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2958 rmrru->hdr = header;
2960 rmrru->base_address = rmrr->base_address;
2961 rmrru->end_address = rmrr->end_address;
2963 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2964 ((void *)rmrr) + rmrr->header.length,
2965 &rmrru->devices_cnt);
2966 if (rmrru->devices_cnt && rmrru->devices == NULL)
2969 list_add(&rmrru->list, &dmar_rmrr_units);
2978 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2980 struct dmar_atsr_unit *atsru;
2981 struct acpi_dmar_atsr *tmp;
2983 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2985 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2986 if (atsr->segment != tmp->segment)
2988 if (atsr->header.length != tmp->header.length)
2990 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2997 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2999 struct acpi_dmar_atsr *atsr;
3000 struct dmar_atsr_unit *atsru;
3002 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3005 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3006 atsru = dmar_find_atsr(atsr);
3010 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3015 * If memory is allocated from slab by ACPI _DSM method, we need to
3016 * copy the memory content because the memory buffer will be freed on exit.
3019 atsru->hdr = (void *)(atsru + 1);
3020 memcpy(atsru->hdr, hdr, hdr->length);
3021 atsru->include_all = atsr->flags & 0x1;
3022 if (!atsru->include_all) {
3023 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3024 (void *)atsr + atsr->header.length,
3025 &atsru->devices_cnt);
3026 if (atsru->devices_cnt && atsru->devices == NULL) {
3032 list_add_rcu(&atsru->list, &dmar_atsr_units);
3037 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3039 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3043 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3045 struct acpi_dmar_atsr *atsr;
3046 struct dmar_atsr_unit *atsru;
3048 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3049 atsru = dmar_find_atsr(atsr);
3051 list_del_rcu(&atsru->list);
3053 intel_iommu_free_atsr(atsru);
3059 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3063 struct acpi_dmar_atsr *atsr;
3064 struct dmar_atsr_unit *atsru;
3066 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3067 atsru = dmar_find_atsr(atsr);
3071 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3072 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3080 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3082 struct dmar_satc_unit *satcu;
3083 struct acpi_dmar_satc *tmp;
3085 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3087 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3088 if (satc->segment != tmp->segment)
3090 if (satc->header.length != tmp->header.length)
3092 if (memcmp(satc, tmp, satc->header.length) == 0)
3099 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3101 struct acpi_dmar_satc *satc;
3102 struct dmar_satc_unit *satcu;
3104 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3107 satc = container_of(hdr, struct acpi_dmar_satc, header);
3108 satcu = dmar_find_satc(satc);
3112 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3116 satcu->hdr = (void *)(satcu + 1);
3117 memcpy(satcu->hdr, hdr, hdr->length);
3118 satcu->atc_required = satc->flags & 0x1;
3119 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3120 (void *)satc + satc->header.length,
3121 &satcu->devices_cnt);
3122 if (satcu->devices_cnt && !satcu->devices) {
3126 list_add_rcu(&satcu->list, &dmar_satc_units);
3131 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3134 struct intel_iommu *iommu = dmaru->iommu;
3136 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3140 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3141 pr_warn("%s: Doesn't support hardware pass through.\n",
3146 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3147 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3148 pr_warn("%s: Doesn't support large page.\n",
3154 * Disable translation if already enabled prior to OS handover.
3156 if (iommu->gcmd & DMA_GCMD_TE)
3157 iommu_disable_translation(iommu);
3159 ret = iommu_init_domains(iommu);
3161 ret = iommu_alloc_root_entry(iommu);
3165 intel_svm_check(iommu);
3167 if (dmaru->ignored) {
3169 * we always have to disable PMRs or DMA may fail on this device
3172 iommu_disable_protect_mem_regions(iommu);
3176 intel_iommu_init_qi(iommu);
3177 iommu_flush_write_buffer(iommu);
3179 #ifdef CONFIG_INTEL_IOMMU_SVM
3180 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3181 ret = intel_svm_enable_prq(iommu);
3186 ret = dmar_set_interrupt(iommu);
3190 iommu_set_root_entry(iommu);
3191 iommu_enable_translation(iommu);
3193 iommu_disable_protect_mem_regions(iommu);
3197 disable_dmar_iommu(iommu);
3199 free_dmar_iommu(iommu);
3203 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3206 struct intel_iommu *iommu = dmaru->iommu;
3208 if (!intel_iommu_enabled)
3214 ret = intel_iommu_add(dmaru);
3216 disable_dmar_iommu(iommu);
3217 free_dmar_iommu(iommu);
3223 static void intel_iommu_free_dmars(void)
3225 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3226 struct dmar_atsr_unit *atsru, *atsr_n;
3227 struct dmar_satc_unit *satcu, *satc_n;
3229 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3230 list_del(&rmrru->list);
3231 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3235 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3236 list_del(&atsru->list);
3237 intel_iommu_free_atsr(atsru);
3239 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3240 list_del(&satcu->list);
3241 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3246 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3248 struct dmar_satc_unit *satcu;
3249 struct acpi_dmar_satc *satc;
3253 dev = pci_physfn(dev);
3256 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3257 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3258 if (satc->segment != pci_domain_nr(dev->bus))
3260 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3261 if (to_pci_dev(tmp) == dev)
3270 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3273 struct pci_bus *bus;
3274 struct pci_dev *bridge = NULL;
3276 struct acpi_dmar_atsr *atsr;
3277 struct dmar_atsr_unit *atsru;
3278 struct dmar_satc_unit *satcu;
3280 dev = pci_physfn(dev);
3281 satcu = dmar_find_matched_satc_unit(dev);
3284 * This device supports ATS as it is in SATC table.
3285 * When the IOMMU is in legacy mode, ATS is enabled
3286 * automatically by hardware for any device that requires
3287 * it, hence the OS should not enable ATS for this device,
3288 * to avoid duplicated TLB invalidation.
3290 return !(satcu->atc_required && !sm_supported(iommu));
3292 for (bus = dev->bus; bus; bus = bus->parent) {
3294 /* If it's an integrated device, allow ATS */
3297 /* Connected via non-PCIe: no ATS */
3298 if (!pci_is_pcie(bridge) ||
3299 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3301 /* If we found the root port, look it up in the ATSR */
3302 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3307 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3308 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3309 if (atsr->segment != pci_domain_nr(dev->bus))
3312 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3313 if (tmp == &bridge->dev)
3316 if (atsru->include_all)
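/*
 * Summary of the SATC handling at the top of this function, as a sketch: a
 * device listed in a SATC unit supports ATS by definition; the only case in
 * which the OS must not enable it is atc_required set while the IOMMU runs
 * in legacy (non-scalable) mode, since hardware already enables ATS there:
 *
 *	atc_required	sm_supported(iommu)	OS may enable ATS
 *	     0			any			yes
 *	     1			 1			yes
 *	     1			 0			no
 */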
3326 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3329 struct dmar_rmrr_unit *rmrru;
3330 struct dmar_atsr_unit *atsru;
3331 struct dmar_satc_unit *satcu;
3332 struct acpi_dmar_atsr *atsr;
3333 struct acpi_dmar_reserved_memory *rmrr;
3334 struct acpi_dmar_satc *satc;
3336 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3339 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3340 rmrr = container_of(rmrru->hdr,
3341 struct acpi_dmar_reserved_memory, header);
3342 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3343 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3344 ((void *)rmrr) + rmrr->header.length,
3345 rmrr->segment, rmrru->devices,
3346 rmrru->devices_cnt);
3349 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3350 dmar_remove_dev_scope(info, rmrr->segment,
3351 rmrru->devices, rmrru->devices_cnt);
3355 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3356 if (atsru->include_all)
3359 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3360 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3361 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3362 (void *)atsr + atsr->header.length,
3363 atsr->segment, atsru->devices,
3364 atsru->devices_cnt);
3369 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3370 if (dmar_remove_dev_scope(info, atsr->segment,
3371 atsru->devices, atsru->devices_cnt))
3375 list_for_each_entry(satcu, &dmar_satc_units, list) {
3376 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3377 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3378 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3379 (void *)satc + satc->header.length,
3380 satc->segment, satcu->devices,
3381 satcu->devices_cnt);
3386 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3387 if (dmar_remove_dev_scope(info, satc->segment,
3388 satcu->devices, satcu->devices_cnt))
3396 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3397 unsigned long val, void *v)
3399 struct memory_notify *mhp = v;
3400 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3401 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3405 case MEM_GOING_ONLINE:
3406 if (iommu_domain_identity_map(si_domain,
3407 start_vpfn, last_vpfn)) {
3408 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3409 start_vpfn, last_vpfn);
3415 case MEM_CANCEL_ONLINE:
3417 struct dmar_drhd_unit *drhd;
3418 struct intel_iommu *iommu;
3419 LIST_HEAD(freelist);
3421 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3424 for_each_active_iommu(iommu, drhd)
3425 iommu_flush_iotlb_psi(iommu, si_domain,
3426 start_vpfn, mhp->nr_pages,
3427 list_empty(&freelist), 0);
3429 put_pages_list(&freelist);
3437 static struct notifier_block intel_iommu_memory_nb = {
3438 .notifier_call = intel_iommu_memory_notifier,
3442 static void intel_disable_iommus(void)
3444 struct intel_iommu *iommu = NULL;
3445 struct dmar_drhd_unit *drhd;
3447 for_each_iommu(iommu, drhd)
3448 iommu_disable_translation(iommu);
3451 void intel_iommu_shutdown(void)
3453 struct dmar_drhd_unit *drhd;
3454 struct intel_iommu *iommu = NULL;
3456 if (no_iommu || dmar_disabled)
3459 down_write(&dmar_global_lock);
3461 /* Disable PMRs explicitly here. */
3462 for_each_iommu(iommu, drhd)
3463 iommu_disable_protect_mem_regions(iommu);
3465 /* Make sure the IOMMUs are switched off */
3466 intel_disable_iommus();
3468 up_write(&dmar_global_lock);
3471 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3473 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3475 return container_of(iommu_dev, struct intel_iommu, iommu);
3478 static ssize_t version_show(struct device *dev,
3479 struct device_attribute *attr, char *buf)
3481 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3482 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3483 return sysfs_emit(buf, "%d:%d\n",
3484 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3486 static DEVICE_ATTR_RO(version);
3488 static ssize_t address_show(struct device *dev,
3489 struct device_attribute *attr, char *buf)
3491 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3492 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3494 static DEVICE_ATTR_RO(address);
3496 static ssize_t cap_show(struct device *dev,
3497 struct device_attribute *attr, char *buf)
3499 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3500 return sysfs_emit(buf, "%llx\n", iommu->cap);
3502 static DEVICE_ATTR_RO(cap);
3504 static ssize_t ecap_show(struct device *dev,
3505 struct device_attribute *attr, char *buf)
3507 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3508 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3510 static DEVICE_ATTR_RO(ecap);
3512 static ssize_t domains_supported_show(struct device *dev,
3513 struct device_attribute *attr, char *buf)
3515 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3516 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3518 static DEVICE_ATTR_RO(domains_supported);
3520 static ssize_t domains_used_show(struct device *dev,
3521 struct device_attribute *attr, char *buf)
3523 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3524 return sysfs_emit(buf, "%d\n",
3525 bitmap_weight(iommu->domain_ids,
3526 cap_ndoms(iommu->cap)));
3528 static DEVICE_ATTR_RO(domains_used);
3530 static struct attribute *intel_iommu_attrs[] = {
3531 &dev_attr_version.attr,
3532 &dev_attr_address.attr,
3534 &dev_attr_ecap.attr,
3535 &dev_attr_domains_supported.attr,
3536 &dev_attr_domains_used.attr,
3540 static struct attribute_group intel_iommu_group = {
3541 .name = "intel-iommu",
3542 .attrs = intel_iommu_attrs,
3545 const struct attribute_group *intel_iommu_groups[] = {
3546 &intel_iommu_group,
3547 NULL,
3548 };
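/*
 * The attributes above are exported per IOMMU through sysfs; a typical way
 * to inspect them (the paths are illustrative and depend on the registered
 * iommu device name, e.g. dmar0) is:
 *
 *	cat /sys/class/iommu/dmar0/intel-iommu/cap
 *	cat /sys/class/iommu/dmar0/intel-iommu/domains_used
 *
 * where the "intel-iommu" directory name comes from intel_iommu_group.
 */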
3550 static bool has_external_pci(void)
3552 struct pci_dev *pdev = NULL;
3554 for_each_pci_dev(pdev)
3555 if (pdev->external_facing) {
3563 static int __init platform_optin_force_iommu(void)
3565 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3568 if (no_iommu || dmar_disabled)
3569 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3572 * If Intel-IOMMU is disabled by default, we will apply identity
3573 * map for all devices except those marked as being untrusted.
3576 iommu_set_default_passthrough(false);
3584 static int __init probe_acpi_namespace_devices(void)
3586 struct dmar_drhd_unit *drhd;
3587 /* To avoid a -Wunused-but-set-variable warning. */
3588 struct intel_iommu *iommu __maybe_unused;
3592 for_each_active_iommu(iommu, drhd) {
3593 for_each_active_dev_scope(drhd->devices,
3594 drhd->devices_cnt, i, dev) {
3595 struct acpi_device_physical_node *pn;
3596 struct acpi_device *adev;
3598 if (dev->bus != &acpi_bus_type)
3601 adev = to_acpi_device(dev);
3602 mutex_lock(&adev->physical_node_lock);
3603 list_for_each_entry(pn,
3604 &adev->physical_node_list, node) {
3605 ret = iommu_probe_device(pn->dev);
3609 mutex_unlock(&adev->physical_node_lock);
3619 static __init int tboot_force_iommu(void)
3621 if (!tboot_enabled())
3624 if (no_iommu || dmar_disabled)
3625 pr_warn("Forcing Intel-IOMMU to enabled\n");
3633 int __init intel_iommu_init(void)
3636 struct dmar_drhd_unit *drhd;
3637 struct intel_iommu *iommu;
3640 * Intel IOMMU is required for a TXT/tboot launch or platform
3641 * opt in, so enforce that.
3643 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3644 platform_optin_force_iommu();
3646 down_write(&dmar_global_lock);
3647 if (dmar_table_init()) {
3649 panic("tboot: Failed to initialize DMAR table\n");
3653 if (dmar_dev_scope_init() < 0) {
3655 panic("tboot: Failed to initialize DMAR device scope\n");
3659 up_write(&dmar_global_lock);
3662 * The bus notifier takes the dmar_global_lock, so lockdep will
3663 * complain later when we register it under the lock.
3665 dmar_register_bus_notifier();
3667 down_write(&dmar_global_lock);
3670 intel_iommu_debugfs_init();
3672 if (no_iommu || dmar_disabled) {
3674 * We exit the function here to ensure IOMMU's remapping and
3675 * mempool aren't setup, which means that the IOMMU's PMRs
3676 * won't be disabled via the call to init_dmars(). So disable
3677 * it explicitly here. The PMRs were setup by tboot prior to
3678 * calling SENTER, but the kernel is expected to reset/tear
3681 if (intel_iommu_tboot_noforce) {
3682 for_each_iommu(iommu, drhd)
3683 iommu_disable_protect_mem_regions(iommu);
3687 * Make sure the IOMMUs are switched off, even when we
3688 * boot into a kexec kernel and the previous kernel left
3691 intel_disable_iommus();
3695 if (list_empty(&dmar_rmrr_units))
3696 pr_info("No RMRR found\n");
3698 if (list_empty(&dmar_atsr_units))
3699 pr_info("No ATSR found\n");
3701 if (list_empty(&dmar_satc_units))
3702 pr_info("No SATC found\n");
3704 init_no_remapping_devices();
3709 panic("tboot: Failed to initialize DMARs\n");
3710 pr_err("Initialization failed\n");
3713 up_write(&dmar_global_lock);
3715 init_iommu_pm_ops();
3717 down_read(&dmar_global_lock);
3718 for_each_active_iommu(iommu, drhd) {
3720 * The flush queue implementation does not perform
3721 * page-selective invalidations that are required for efficient
3722 * TLB flushes in virtual environments. The benefit of batching
3723 * is likely to be much lower than the overhead of synchronizing
3724 * the virtual and physical IOMMU page-tables.
3726 if (cap_caching_mode(iommu->cap) &&
3727 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3728 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3729 iommu_set_dma_strict();
3731 iommu_device_sysfs_add(&iommu->iommu, NULL,
3734 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3736 iommu_pmu_register(iommu);
3738 up_read(&dmar_global_lock);
3740 if (si_domain && !hw_pass_through)
3741 register_memory_notifier(&intel_iommu_memory_nb);
3743 down_read(&dmar_global_lock);
3744 if (probe_acpi_namespace_devices())
3745 pr_warn("ACPI name space devices didn't probe correctly\n");
3747 /* Finally, we enable the DMA remapping hardware. */
3748 for_each_iommu(iommu, drhd) {
3749 if (!drhd->ignored && !translation_pre_enabled(iommu))
3750 iommu_enable_translation(iommu);
3752 iommu_disable_protect_mem_regions(iommu);
3754 up_read(&dmar_global_lock);
3756 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3758 intel_iommu_enabled = 1;
3763 intel_iommu_free_dmars();
3764 up_write(&dmar_global_lock);
3768 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3770 struct device_domain_info *info = opaque;
3772 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3777 * NB - intel-iommu lacks any sort of reference counting for the users of
3778 * dependent devices. If multiple endpoints have intersecting dependent
3779 * devices, unbinding the driver from any one of them will possibly leave
3780 * the others unable to operate.
3782 static void domain_context_clear(struct device_domain_info *info)
3784 if (!dev_is_pci(info->dev))
3785 domain_context_clear_one(info, info->bus, info->devfn);
3787 pci_for_each_dma_alias(to_pci_dev(info->dev),
3788 &domain_context_clear_one_cb, info);
3792 * Clear the page table pointer in context or pasid table entries so that
3793 * all DMA requests without PASID from the device are blocked. If the page
3794 * table has been set, clean up the data structures.
3796 void device_block_translation(struct device *dev)
3798 struct device_domain_info *info = dev_iommu_priv_get(dev);
3799 struct intel_iommu *iommu = info->iommu;
3800 unsigned long flags;
3802 iommu_disable_pci_caps(info);
3803 if (!dev_is_real_dma_subdevice(dev)) {
3804 if (sm_supported(iommu))
3805 intel_pasid_tear_down_entry(iommu, dev,
3806 IOMMU_NO_PASID, false);
3808 domain_context_clear(info);
3814 spin_lock_irqsave(&info->domain->lock, flags);
3815 list_del(&info->link);
3816 spin_unlock_irqrestore(&info->domain->lock, flags);
3818 domain_detach_iommu(info->domain, iommu);
3819 info->domain = NULL;
3822 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3826 /* calculate AGAW */
3827 domain->gaw = guest_width;
3828 adjust_width = guestwidth_to_adjustwidth(guest_width);
3829 domain->agaw = width_to_agaw(adjust_width);
3831 domain->iommu_coherency = false;
3832 domain->iommu_superpage = 0;
3833 domain->max_addr = 0;
3835 /* always allocate the top pgd */
3836 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3839 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3843 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3846 device_block_translation(dev);
3850 static struct iommu_domain blocking_domain = {
3851 .type = IOMMU_DOMAIN_BLOCKED,
3852 .ops = &(const struct iommu_domain_ops) {
3853 .attach_dev = blocking_domain_attach_dev,
3857 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3859 struct dmar_domain *dmar_domain;
3860 struct iommu_domain *domain;
3863 case IOMMU_DOMAIN_DMA:
3864 case IOMMU_DOMAIN_UNMANAGED:
3865 dmar_domain = alloc_domain(type);
3867 pr_err("Can't allocate dmar_domain\n");
3870 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3871 pr_err("Domain initialization failed\n");
3872 domain_exit(dmar_domain);
3876 domain = &dmar_domain->domain;
3877 domain->geometry.aperture_start = 0;
3878 domain->geometry.aperture_end =
3879 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3880 domain->geometry.force_aperture = true;
3883 case IOMMU_DOMAIN_IDENTITY:
3884 return &si_domain->domain;
3885 case IOMMU_DOMAIN_SVA:
3886 return intel_svm_domain_alloc();
3894 static struct iommu_domain *
3895 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3896 struct iommu_domain *parent,
3897 const struct iommu_user_data *user_data)
3899 struct device_domain_info *info = dev_iommu_priv_get(dev);
3900 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3901 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3902 struct intel_iommu *iommu = info->iommu;
3903 struct dmar_domain *dmar_domain;
3904 struct iommu_domain *domain;
3906 /* Must be NESTING domain */
3908 if (!nested_supported(iommu) || flags)
3909 return ERR_PTR(-EOPNOTSUPP);
3910 return intel_nested_domain_alloc(parent, user_data);
3914 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3915 return ERR_PTR(-EOPNOTSUPP);
3916 if (nested_parent && !nested_supported(iommu))
3917 return ERR_PTR(-EOPNOTSUPP);
3918 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3919 return ERR_PTR(-EOPNOTSUPP);
3922 * The domain_alloc_user op needs to fully initialize a domain before
3923 * returning, so use iommu_domain_alloc() here for simplicity.
3925 domain = iommu_domain_alloc(dev->bus);
3927 return ERR_PTR(-ENOMEM);
3929 dmar_domain = to_dmar_domain(domain);
3931 if (nested_parent) {
3932 dmar_domain->nested_parent = true;
3933 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3934 spin_lock_init(&dmar_domain->s1_lock);
3937 if (dirty_tracking) {
3938 if (dmar_domain->use_first_level) {
3939 iommu_domain_free(domain);
3940 return ERR_PTR(-EOPNOTSUPP);
3942 domain->dirty_ops = &intel_dirty_ops;
3948 static void intel_iommu_domain_free(struct iommu_domain *domain)
3950 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3952 WARN_ON(dmar_domain->nested_parent &&
3953 !list_empty(&dmar_domain->s1_domains));
3954 if (domain != &si_domain->domain)
3955 domain_exit(dmar_domain);
3958 int prepare_domain_attach_device(struct iommu_domain *domain,
3961 struct device_domain_info *info = dev_iommu_priv_get(dev);
3962 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3963 struct intel_iommu *iommu = info->iommu;
3966 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3969 if (domain->dirty_ops && !ssads_supported(iommu))
3972 /* check if this iommu agaw is sufficient for max mapped address */
3973 addr_width = agaw_to_width(iommu->agaw);
3974 if (addr_width > cap_mgaw(iommu->cap))
3975 addr_width = cap_mgaw(iommu->cap);
3977 if (dmar_domain->max_addr > (1LL << addr_width))
3979 dmar_domain->gaw = addr_width;
3982 * Knock out extra levels of page tables if necessary
3984 while (iommu->agaw < dmar_domain->agaw) {
3985 struct dma_pte *pte;
3987 pte = dmar_domain->pgd;
3988 if (dma_pte_present(pte)) {
3989 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3990 free_pgtable_page(pte);
3992 dmar_domain->agaw--;
3995 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3996 context_copied(iommu, info->bus, info->devfn))
3997 return intel_pasid_setup_sm_context(dev);
4002 static int intel_iommu_attach_device(struct iommu_domain *domain,
4005 struct device_domain_info *info = dev_iommu_priv_get(dev);
4009 device_block_translation(dev);
4011 ret = prepare_domain_attach_device(domain, dev);
4015 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4018 static int intel_iommu_map(struct iommu_domain *domain,
4019 unsigned long iova, phys_addr_t hpa,
4020 size_t size, int iommu_prot, gfp_t gfp)
4022 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4026 if (iommu_prot & IOMMU_READ)
4027 prot |= DMA_PTE_READ;
4028 if (iommu_prot & IOMMU_WRITE)
4029 prot |= DMA_PTE_WRITE;
4030 if (dmar_domain->set_pte_snp)
4031 prot |= DMA_PTE_SNP;
4033 max_addr = iova + size;
4034 if (dmar_domain->max_addr < max_addr) {
4037 /* check if minimum agaw is sufficient for mapped address */
4038 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4039 if (end < max_addr) {
4040 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4041 __func__, dmar_domain->gaw, max_addr);
4045 dmar_domain->max_addr = max_addr;
4047 /* Round up size to next multiple of PAGE_SIZE, if it and
4048 the low bits of hpa would take us onto the next page */
4049 size = aligned_nrpages(hpa, size);
4050 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4051 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
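/*
 * Illustration of the rounding above (assuming aligned_nrpages() counts the
 * VTD pages spanned by hpa .. hpa + size): a mapping of size 0x2000 whose
 * hpa has low bits 0x800 touches three 4KiB pages, so 'size' becomes three
 * page frames even though the byte count alone covers exactly two pages.
 */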
4054 static int intel_iommu_map_pages(struct iommu_domain *domain,
4055 unsigned long iova, phys_addr_t paddr,
4056 size_t pgsize, size_t pgcount,
4057 int prot, gfp_t gfp, size_t *mapped)
4059 unsigned long pgshift = __ffs(pgsize);
4060 size_t size = pgcount << pgshift;
4063 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4066 if (!IS_ALIGNED(iova | paddr, pgsize))
4069 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4076 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4077 unsigned long iova, size_t size,
4078 struct iommu_iotlb_gather *gather)
4080 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4081 unsigned long start_pfn, last_pfn;
4084 /* Cope with horrid API which requires us to unmap more than the
4085 size argument if it happens to be a large-page mapping. */
4086 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4087 &level, GFP_ATOMIC)))
4090 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4091 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4093 start_pfn = iova >> VTD_PAGE_SHIFT;
4094 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4096 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4098 if (dmar_domain->max_addr == iova + size)
4099 dmar_domain->max_addr = iova;
4102 * We do not use page-selective IOTLB invalidation in flush queue,
4103 * so there is no need to track page and sync iotlb.
4105 if (!iommu_iotlb_gather_queued(gather))
4106 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4111 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4113 size_t pgsize, size_t pgcount,
4114 struct iommu_iotlb_gather *gather)
4116 unsigned long pgshift = __ffs(pgsize);
4117 size_t size = pgcount << pgshift;
4119 return intel_iommu_unmap(domain, iova, size, gather);
4122 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4123 struct iommu_iotlb_gather *gather)
4125 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4126 unsigned long iova_pfn = IOVA_PFN(gather->start);
4127 size_t size = gather->end - gather->start;
4128 struct iommu_domain_info *info;
4129 unsigned long start_pfn;
4130 unsigned long nrpages;
4133 nrpages = aligned_nrpages(gather->start, size);
4134 start_pfn = mm_to_dma_pfn_start(iova_pfn);
4136 xa_for_each(&dmar_domain->iommu_array, i, info)
4137 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4139 list_empty(&gather->freelist), 0);
4141 if (dmar_domain->nested_parent)
4142 parent_domain_flush(dmar_domain, start_pfn, nrpages,
4143 list_empty(&gather->freelist));
4144 put_pages_list(&gather->freelist);
4147 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4150 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4151 struct dma_pte *pte;
4155 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4157 if (pte && dma_pte_present(pte))
4158 phys = dma_pte_addr(pte) +
4159 (iova & (BIT_MASK(level_to_offset_bits(level) +
4160 VTD_PAGE_SHIFT) - 1));
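/*
 * Worked example of the address math above (a sketch, assuming the usual
 * 9-bit stride where level_to_offset_bits(level) == (level - 1) * 9): a
 * 2MiB superpage is found at level 2, the mask becomes
 * (1UL << (9 + 12)) - 1 == 0x1fffff, and the result is the superpage base
 * taken from the PTE plus the low 21 bits of the IOVA.
 */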
4165 static bool domain_support_force_snooping(struct dmar_domain *domain)
4167 struct device_domain_info *info;
4168 bool support = true;
4170 assert_spin_locked(&domain->lock);
4171 list_for_each_entry(info, &domain->devices, link) {
4172 if (!ecap_sc_support(info->iommu->ecap)) {
4181 static void domain_set_force_snooping(struct dmar_domain *domain)
4183 struct device_domain_info *info;
4185 assert_spin_locked(&domain->lock);
4187 * Second level page table supports per-PTE snoop control. The
4188 * iommu_map() interface will handle this by setting the SNP bit.
4190 if (!domain->use_first_level) {
4191 domain->set_pte_snp = true;
4195 list_for_each_entry(info, &domain->devices, link)
4196 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4200 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4202 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4203 unsigned long flags;
4205 if (dmar_domain->force_snooping)
4208 spin_lock_irqsave(&dmar_domain->lock, flags);
4209 if (!domain_support_force_snooping(dmar_domain) ||
4210 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4211 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4215 domain_set_force_snooping(dmar_domain);
4216 dmar_domain->force_snooping = true;
4217 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4222 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4224 struct device_domain_info *info = dev_iommu_priv_get(dev);
4227 case IOMMU_CAP_CACHE_COHERENCY:
4228 case IOMMU_CAP_DEFERRED_FLUSH:
4230 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4231 return dmar_platform_optin();
4232 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4233 return ecap_sc_support(info->iommu->ecap);
4234 case IOMMU_CAP_DIRTY_TRACKING:
4235 return ssads_supported(info->iommu);
4241 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4243 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4244 struct device_domain_info *info;
4245 struct intel_iommu *iommu;
4249 iommu = device_lookup_iommu(dev, &bus, &devfn);
4250 if (!iommu || !iommu->iommu.ops)
4251 return ERR_PTR(-ENODEV);
4253 info = kzalloc(sizeof(*info), GFP_KERNEL);
4255 return ERR_PTR(-ENOMEM);
4257 if (dev_is_real_dma_subdevice(dev)) {
4258 info->bus = pdev->bus->number;
4259 info->devfn = pdev->devfn;
4260 info->segment = pci_domain_nr(pdev->bus);
4263 info->devfn = devfn;
4264 info->segment = iommu->segment;
4268 info->iommu = iommu;
4269 if (dev_is_pci(dev)) {
4270 if (ecap_dev_iotlb_support(iommu->ecap) &&
4271 pci_ats_supported(pdev) &&
4272 dmar_ats_supported(pdev, iommu)) {
4273 info->ats_supported = 1;
4274 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4277 * For an IOMMU that supports device IOTLB throttling
4278 * (DIT), we assign the PFSID to the invalidation desc
4279 * of a VF so that IOMMU HW can gauge queue depth
4280 * at PF level. If DIT is not set, the PFSID field is
4281 * treated as reserved and should be set to 0.
4283 if (ecap_dit(iommu->ecap))
4284 info->pfsid = pci_dev_id(pci_physfn(pdev));
4285 info->ats_qdep = pci_ats_queue_depth(pdev);
4287 if (sm_supported(iommu)) {
4288 if (pasid_supported(iommu)) {
4289 int features = pci_pasid_features(pdev);
4292 info->pasid_supported = features | 1;
4295 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4296 pci_pri_supported(pdev))
4297 info->pri_supported = 1;
4301 dev_iommu_priv_set(dev, info);
4302 if (pdev && pci_ats_supported(pdev)) {
4303 ret = device_rbtree_insert(iommu, info);
4308 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4309 ret = intel_pasid_alloc_table(dev);
4311 dev_err(dev, "PASID table allocation failed\n");
4315 if (!context_copied(iommu, info->bus, info->devfn)) {
4316 ret = intel_pasid_setup_sm_context(dev);
4322 intel_iommu_debugfs_create_dev(info);
4324 return &iommu->iommu;
4326 intel_pasid_free_table(dev);
4328 device_rbtree_remove(info);
4332 return ERR_PTR(ret);
4335 static void intel_iommu_release_device(struct device *dev)
4337 struct device_domain_info *info = dev_iommu_priv_get(dev);
4338 struct intel_iommu *iommu = info->iommu;
4340 mutex_lock(&iommu->iopf_lock);
4341 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4342 device_rbtree_remove(info);
4343 mutex_unlock(&iommu->iopf_lock);
4345 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4346 !context_copied(iommu, info->bus, info->devfn))
4347 intel_pasid_teardown_sm_context(dev);
4349 intel_pasid_free_table(dev);
4350 intel_iommu_debugfs_remove_dev(info);
4352 set_dma_ops(dev, NULL);
4355 static void intel_iommu_probe_finalize(struct device *dev)
4357 set_dma_ops(dev, NULL);
4358 iommu_setup_dma_ops(dev, 0, U64_MAX);
4361 static void intel_iommu_get_resv_regions(struct device *device,
4362 struct list_head *head)
4364 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4365 struct iommu_resv_region *reg;
4366 struct dmar_rmrr_unit *rmrr;
4367 struct device *i_dev;
4371 for_each_rmrr_units(rmrr) {
4372 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4374 struct iommu_resv_region *resv;
4375 enum iommu_resv_type type;
4378 if (i_dev != device &&
4379 !is_downstream_to_pci_bridge(device, i_dev))
4382 length = rmrr->end_address - rmrr->base_address + 1;
4384 type = device_rmrr_is_relaxable(device) ?
4385 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4387 resv = iommu_alloc_resv_region(rmrr->base_address,
4393 list_add_tail(&resv->list, head);
4398 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4399 if (dev_is_pci(device)) {
4400 struct pci_dev *pdev = to_pci_dev(device);
4402 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4403 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4404 IOMMU_RESV_DIRECT_RELAXABLE,
4407 list_add_tail(®->list, head);
4410 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4412 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4413 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4414 0, IOMMU_RESV_MSI, GFP_KERNEL);
4417 list_add_tail(®->list, head);
4420 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4422 if (dev_is_pci(dev))
4423 return pci_device_group(dev);
4424 return generic_device_group(dev);
4427 static int intel_iommu_enable_sva(struct device *dev)
4429 struct device_domain_info *info = dev_iommu_priv_get(dev);
4430 struct intel_iommu *iommu;
4432 if (!info || dmar_disabled)
4435 iommu = info->iommu;
4439 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4442 if (!info->pasid_enabled || !info->ats_enabled)
4446 * Devices having device-specific I/O fault handling should not
4447 * support PCI/PRI. The IOMMU side has no means to check the
4448 * capability of device-specific IOPF. Therefore, the IOMMU can only
4449 * assume that if the device driver enables SVA on a non-PRI
4450 * device, it will handle IOPF in its own way.
4452 if (!info->pri_supported)
4455 /* Devices supporting PRI should have it enabled. */
4456 if (!info->pri_enabled)
4462 static int intel_iommu_enable_iopf(struct device *dev)
4464 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4465 struct device_domain_info *info = dev_iommu_priv_get(dev);
4466 struct intel_iommu *iommu;
4469 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4472 if (info->pri_enabled)
4475 iommu = info->iommu;
4479 /* PASID is required in PRG Response Message. */
4480 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4483 ret = pci_reset_pri(pdev);
4487 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4491 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4493 iopf_queue_remove_device(iommu->iopf_queue, dev);
4497 info->pri_enabled = 1;
4502 static int intel_iommu_disable_iopf(struct device *dev)
4504 struct device_domain_info *info = dev_iommu_priv_get(dev);
4505 struct intel_iommu *iommu = info->iommu;
4507 if (!info->pri_enabled)
4511 * PCIe spec states that by clearing PRI enable bit, the Page
4512 * Request Interface will not issue new page requests, but may still have
4513 * outstanding page requests that have been transmitted or are
4514 * queued for transmission. This is supposed to be called after
4515 * the device driver has stopped DMA, all PASIDs have been
4516 * unbound and the outstanding PRQs have been drained.
4518 pci_disable_pri(to_pci_dev(dev));
4519 info->pri_enabled = 0;
4520 iopf_queue_remove_device(iommu->iopf_queue, dev);
4526 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4529 case IOMMU_DEV_FEAT_IOPF:
4530 return intel_iommu_enable_iopf(dev);
4532 case IOMMU_DEV_FEAT_SVA:
4533 return intel_iommu_enable_sva(dev);
4541 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4544 case IOMMU_DEV_FEAT_IOPF:
4545 return intel_iommu_disable_iopf(dev);
4547 case IOMMU_DEV_FEAT_SVA:
4555 static bool intel_iommu_is_attach_deferred(struct device *dev)
4557 struct device_domain_info *info = dev_iommu_priv_get(dev);
4559 return translation_pre_enabled(info->iommu) && !info->domain;
4563 * Check that the device does not live on an external facing PCI port that is
4564 * marked as untrusted. Such devices should not be able to apply quirks and
4565 * thus not be able to bypass the IOMMU restrictions.
4567 static bool risky_device(struct pci_dev *pdev)
4569 if (pdev->untrusted) {
4571 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4572 pdev->vendor, pdev->device);
4573 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4579 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4580 unsigned long iova, size_t size)
4582 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4583 unsigned long pages = aligned_nrpages(iova, size);
4584 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4585 struct iommu_domain_info *info;
4588 xa_for_each(&dmar_domain->iommu_array, i, info)
4589 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4593 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4595 struct device_domain_info *info = dev_iommu_priv_get(dev);
4596 struct dev_pasid_info *curr, *dev_pasid = NULL;
4597 struct intel_iommu *iommu = info->iommu;
4598 struct dmar_domain *dmar_domain;
4599 struct iommu_domain *domain;
4600 unsigned long flags;
4602 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4603 if (WARN_ON_ONCE(!domain))
4607 * The SVA implementation needs to handle its own bookkeeping, such as
4608 * the mm notification. Before consolidating that code into the iommu
4609 * core, let the intel sva code handle it.
4611 if (domain->type == IOMMU_DOMAIN_SVA) {
4612 intel_svm_remove_dev_pasid(dev, pasid);
4616 dmar_domain = to_dmar_domain(domain);
4617 spin_lock_irqsave(&dmar_domain->lock, flags);
4618 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4619 if (curr->dev == dev && curr->pasid == pasid) {
4620 list_del(&curr->link_domain);
4625 WARN_ON_ONCE(!dev_pasid);
4626 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4628 domain_detach_iommu(dmar_domain, iommu);
4629 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4632 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4633 intel_drain_pasid_prq(dev, pasid);
4636 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4637 struct device *dev, ioasid_t pasid)
4639 struct device_domain_info *info = dev_iommu_priv_get(dev);
4640 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4641 struct intel_iommu *iommu = info->iommu;
4642 struct dev_pasid_info *dev_pasid;
4643 unsigned long flags;
4646 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4649 if (domain->dirty_ops)
4652 if (context_copied(iommu, info->bus, info->devfn))
4655 ret = prepare_domain_attach_device(domain, dev);
4659 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4663 ret = domain_attach_iommu(dmar_domain, iommu);
4667 if (domain_type_is_si(dmar_domain))
4668 ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4669 else if (dmar_domain->use_first_level)
4670 ret = domain_setup_first_level(iommu, dmar_domain,
4673 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4676 goto out_detach_iommu;
4678 dev_pasid->dev = dev;
4679 dev_pasid->pasid = pasid;
4680 spin_lock_irqsave(&dmar_domain->lock, flags);
4681 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4682 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4684 if (domain->type & __IOMMU_DOMAIN_PAGING)
4685 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4689 domain_detach_iommu(dmar_domain, iommu);
4695 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4697 struct device_domain_info *info = dev_iommu_priv_get(dev);
4698 struct intel_iommu *iommu = info->iommu;
4699 struct iommu_hw_info_vtd *vtd;
4701 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4703 return ERR_PTR(-ENOMEM);
4705 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4706 vtd->cap_reg = iommu->cap;
4707 vtd->ecap_reg = iommu->ecap;
4708 *length = sizeof(*vtd);
4709 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4714 * Set dirty tracking for the device list of a domain. The caller must
4715 * hold the domain->lock when calling it.
4717 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4719 struct device_domain_info *info;
4722 list_for_each_entry(info, devices, link) {
4723 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4724 IOMMU_NO_PASID, enable);
4732 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4735 struct dmar_domain *s1_domain;
4736 unsigned long flags;
4739 spin_lock(&domain->s1_lock);
4740 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4741 spin_lock_irqsave(&s1_domain->lock, flags);
4742 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4743 spin_unlock_irqrestore(&s1_domain->lock, flags);
4747 spin_unlock(&domain->s1_lock);
4751 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4752 spin_lock_irqsave(&s1_domain->lock, flags);
4753 device_set_dirty_tracking(&s1_domain->devices,
4754 domain->dirty_tracking);
4755 spin_unlock_irqrestore(&s1_domain->lock, flags);
4757 spin_unlock(&domain->s1_lock);
4761 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4764 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4767 spin_lock(&dmar_domain->lock);
4768 if (dmar_domain->dirty_tracking == enable)
4771 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4775 if (dmar_domain->nested_parent) {
4776 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4781 dmar_domain->dirty_tracking = enable;
4783 spin_unlock(&dmar_domain->lock);
4788 device_set_dirty_tracking(&dmar_domain->devices,
4789 dmar_domain->dirty_tracking);
4790 spin_unlock(&dmar_domain->lock);
4794 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4795 unsigned long iova, size_t size,
4796 unsigned long flags,
4797 struct iommu_dirty_bitmap *dirty)
4799 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4800 unsigned long end = iova + size - 1;
4801 unsigned long pgsize;
4804 * IOMMUFD core calls into a dirty tracking disabled domain without an
4805 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4806 * have occurred when we stopped dirty tracking. This ensures that we
4807 * never inherit dirtied bits from a previous cycle.
4809 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4813 struct dma_pte *pte;
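		/* Find the leaf PTE covering this IOVA and the size it maps. */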
4816 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4818 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4819 if (!pte || !dma_pte_present(pte)) {
4824 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4825 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4827 } while (iova < end);
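/* Dirty-tracking callbacks exposed to the IOMMU core for this driver. */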
4832 static const struct iommu_dirty_ops intel_dirty_ops = {
4833 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4834 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4837 const struct iommu_ops intel_iommu_ops = {
4838 .blocked_domain = &blocking_domain,
4839 .release_domain = &blocking_domain,
4840 .capable = intel_iommu_capable,
4841 .hw_info = intel_iommu_hw_info,
4842 .domain_alloc = intel_iommu_domain_alloc,
4843 .domain_alloc_user = intel_iommu_domain_alloc_user,
4844 .probe_device = intel_iommu_probe_device,
4845 .probe_finalize = intel_iommu_probe_finalize,
4846 .release_device = intel_iommu_release_device,
4847 .get_resv_regions = intel_iommu_get_resv_regions,
4848 .device_group = intel_iommu_device_group,
4849 .dev_enable_feat = intel_iommu_dev_enable_feat,
4850 .dev_disable_feat = intel_iommu_dev_disable_feat,
4851 .is_attach_deferred = intel_iommu_is_attach_deferred,
4852 .def_domain_type = device_def_domain_type,
4853 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4854 .pgsize_bitmap = SZ_4K,
4855 #ifdef CONFIG_INTEL_IOMMU_SVM
4856 .page_response = intel_svm_page_response,
#endif
4858 .default_domain_ops = &(const struct iommu_domain_ops) {
4859 .attach_dev = intel_iommu_attach_device,
4860 .set_dev_pasid = intel_iommu_set_dev_pasid,
4861 .map_pages = intel_iommu_map_pages,
4862 .unmap_pages = intel_iommu_unmap_pages,
4863 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4864 .flush_iotlb_all = intel_flush_iotlb_all,
4865 .iotlb_sync = intel_iommu_tlb_sync,
4866 .iova_to_phys = intel_iommu_iova_to_phys,
4867 .free = intel_iommu_domain_free,
4868 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
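/*
 * Integrated graphics on the chipsets below cannot be safely translated by
 * the IOMMU; turn off DMA remapping for the graphics device (dmar_map_gfx).
 */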
4872 static void quirk_iommu_igfx(struct pci_dev *dev)
4874 if (risky_device(dev))
4877 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4881 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4883 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4884 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4885 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4886 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4887 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4890 /* Broadwell igfx malfunctions with dmar */
4891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4893 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4894 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4895 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4897 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4898 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4899 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4904 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4905 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4906 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4907 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4908 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4911 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4912 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4913 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4914 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4916 static void quirk_iommu_rwbf(struct pci_dev *dev)
4918 if (risky_device(dev))
4922 * Mobile 4 Series Chipset neglects to set RWBF capability,
4923 * but needs it. Same seems to hold for the desktop versions.
4925 pci_info(dev, "Forcing write-buffer flush capability\n");
4929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4930 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4933 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4934 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
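/* Fields of the GGC (graphics control) register read by the quirk below. */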
4938 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4939 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4940 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4941 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4942 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4943 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4944 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4945 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
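/*
 * Ironlake/Calpella: if the BIOS allocated no VT-enabled (shadow) GTT, the
 * integrated graphics cannot be put behind the IOMMU, so translation is
 * disabled for it. Otherwise force strict IOTLB flushing, since the GPU
 * must be idle before its mappings are flushed.
 */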
4947 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4951 if (risky_device(dev))
4954 if (pci_read_config_word(dev, GGC, &ggc))
4957 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4958 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4960 } else if (dmar_map_gfx) {
4961 /* we have to ensure the gfx device is idle before we flush */
4962 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4963 iommu_set_dma_strict();
4966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4967 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4968 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4969 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
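/*
 * For the integrated graphics device IDs matched below, leave translation
 * enabled on the graphics-dedicated DMAR unit when translation is being
 * torn down (iommu_skip_te_disable); clearing TE there while graphics DMA
 * is still active can hang the system.
 */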
4971 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4975 if (!IS_GFX_DEVICE(dev))
4978 ver = (dev->device >> 8) & 0xff;
4979 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4980 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4981 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4984 if (risky_device(dev))
4987 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4988 iommu_skip_te_disable = 1;
4990 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4992 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4993 ISOCH DMAR unit for the Azalia sound device, but not give it any
4994 TLB entries, which causes it to deadlock. Check for that. We do
4995 this in a function called from init_dmars(), instead of in a PCI
4996 quirk, because we don't want to print the obnoxious "BIOS broken"
4997 message if VT-d is actually disabled.
4999 static void __init check_tylersburg_isoch(void)
5001 struct pci_dev *pdev;
5002 uint32_t vtisochctrl;
5004 /* If there's no Azalia in the system anyway, forget it. */
5005 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5009 if (risky_device(pdev)) {
5016 /* System Management Registers. Might be hidden, in which case
5017 we can't do the sanity check. But that's OK, because the
5018 known-broken BIOSes _don't_ actually hide it, so far. */
5019 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5023 if (risky_device(pdev)) {
5028 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5035 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5036 if (vtisochctrl & 1)
5039 /* Drop all bits other than the number of TLB entries */
5040 vtisochctrl &= 0x1c;
5042 /* If we have the recommended number of TLB entries (16), fine. */
5043 if (vtisochctrl == 0x10)
5046 /* Zero TLB entries? You get to ride the short bus to school. */
5048 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5049 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5050 dmi_get_system_info(DMI_BIOS_VENDOR),
5051 dmi_get_system_info(DMI_BIOS_VERSION),
5052 dmi_get_system_info(DMI_PRODUCT_VERSION));
5053 iommu_identity_mapping |= IDENTMAP_AZALIA;
5057 pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5062 * Here we deal with a device TLB defect where the device may inadvertently issue ATS
5063 * invalidation completions before posted writes that were initiated with a translated
5064 * address and used translations matching the invalidation address range, violating
5065 * the invalidation completion ordering.
5066 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
5067 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5068 * under the control of the trusted/privileged host device driver must use this quirk.
5070 * Device TLBs are invalidated under the following six conditions:
5071 * 1. Device driver unmaps an IOVA via the DMA API
5072 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
5073 * 3. PASID is torn down, after the PASID cache is flushed, e.g. on process
5074 *    exit_mmap() due to a crash
5075 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where the
5076 *    VM has to free pages that were unmapped
5077 * 5. Userspace driver unmaps a DMA buffer
5078 * 6. Cache invalidation in vSVA usage (upcoming)
5080 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5081 * before unmap/unbind. For #3, the iommu driver is invoked via an mmu_notifier
5082 * to invalidate the TLB the same way as a normal user unmap, which will use this quirk.
5083 * The dTLB invalidation after the PASID cache flush does not need this quirk.
5085 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5087 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5088 unsigned long address, unsigned long mask,
5089 u32 pasid, u16 qdep)
5093 if (likely(!info->dtlb_extra_inval))
5096 sid = PCI_DEVID(info->bus, info->devfn);
5097 if (pasid == IOMMU_NO_PASID) {
5098 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5099 qdep, address, mask);
5101 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5102 pasid, qdep, address, mask);
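/* The ecmd status code lives in bits 7:1 of the ECRSP register value. */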
5106 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5109 * Function to submit a command to the enhanced command interface. The
5110 * valid enhanced command descriptions are defined in Table 47 of the
5111 * VT-d spec. The VT-d hardware implementation may support some but not
5112 * all commands, which can be determined by checking the Enhanced
5113 * Command Capability Register.
 * Return values:
5116 * - 0: Command successful without any error;
5117 * - Negative: software error value;
5118 * - Nonzero positive: failure status code defined in Table 48.
5120 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5122 unsigned long flags;
5126 if (!cap_ecmds(iommu->cap))
5129 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5131 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5132 if (res & DMA_ECMD_ECRSP_IP) {
5138 * Unconditionally write operand B, because:
5139 * - There is no side effect if an ecmd doesn't require an
5140 *   operand B, even though the register is written anyway.
5141 * - This is not invoked in any critical path, so the extra MMIO
5142 *   write raises no performance concern.
5144 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5145 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5147 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5148 !(res & DMA_ECMD_ECRSP_IP), res);
5150 if (res & DMA_ECMD_ECRSP_IP) {
5155 ret = ecmd_get_status_code(res);
5157 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);