1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
32 #include "cap_audit.h"
35 #define ROOT_SIZE VTD_PAGE_SIZE
36 #define CONTEXT_SIZE VTD_PAGE_SIZE
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
43 #define IOAPIC_RANGE_START (0xfee00000)
44 #define IOAPIC_RANGE_END (0xfeefffff)
45 #define IOVA_START_ADDR (0x1000)
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
49 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
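/*
 * Worked example, assuming VTD_PAGE_SHIFT == 12: for gaw == 48,
 * __DOMAIN_MAX_PFN(48) == 2^36 - 1 and __DOMAIN_MAX_ADDR(48) == 2^48 - 1.
 * On a 64-bit build DOMAIN_MAX_PFN(48) is therefore also 2^36 - 1; with a
 * 32-bit unsigned long it would clamp to ULONG_MAX instead.
 */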
58 /* IO virtual address start page frame number */
59 #define IOVA_START_PFN (1)
61 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
63 static void __init check_tylersburg_isoch(void);
64 static int rwbf_quirk;
67 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
68 * (used when the kernel is launched with TXT)
70 static int force_on = 0;
71 static int intel_iommu_tboot_noforce;
72 static int no_platform_optin;
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
77 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
80 static phys_addr_t root_entry_lctp(struct root_entry *re)
85 return re->lo & VTD_PAGE_MASK;
89 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
92 static phys_addr_t root_entry_uctp(struct root_entry *re)
97 return re->hi & VTD_PAGE_MASK;
101 * This domain is a static identity mapping domain.
102 * 1. This domain creates a static 1:1 mapping to all usable memory.
103 * 2. It maps to each iommu if successful.
104 * 3. Each iommu maps to this domain if successful.
106 static struct dmar_domain *si_domain;
107 static int hw_pass_through = 1;
109 struct dmar_rmrr_unit {
110 struct list_head list; /* list of rmrr units */
111 struct acpi_dmar_header *hdr; /* ACPI header */
112 u64 base_address; /* reserved base address*/
113 u64 end_address; /* reserved end address */
114 struct dmar_dev_scope *devices; /* target devices */
115 int devices_cnt; /* target device count */
118 struct dmar_atsr_unit {
119 struct list_head list; /* list of ATSR units */
120 struct acpi_dmar_header *hdr; /* ACPI header */
121 struct dmar_dev_scope *devices; /* target devices */
122 int devices_cnt; /* target device count */
123 u8 include_all:1; /* include all ports */
126 struct dmar_satc_unit {
127 struct list_head list; /* list of SATC units */
128 struct acpi_dmar_header *hdr; /* ACPI header */
129 struct dmar_dev_scope *devices; /* target devices */
130 struct intel_iommu *iommu; /* the corresponding iommu */
131 int devices_cnt; /* target device count */
132 u8 atc_required:1; /* ATS is required */
135 static LIST_HEAD(dmar_atsr_units);
136 static LIST_HEAD(dmar_rmrr_units);
137 static LIST_HEAD(dmar_satc_units);
139 #define for_each_rmrr_units(rmrr) \
140 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
142 static void intel_iommu_domain_free(struct iommu_domain *domain);
144 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
145 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
147 int intel_iommu_enabled = 0;
148 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
150 static int dmar_map_gfx = 1;
151 static int intel_iommu_superpage = 1;
152 static int iommu_identity_mapping;
153 static int iommu_skip_te_disable;
155 #define IDENTMAP_GFX 2
156 #define IDENTMAP_AZALIA 4
158 const struct iommu_ops intel_iommu_ops;
159 static const struct iommu_dirty_ops intel_dirty_ops;
161 static bool translation_pre_enabled(struct intel_iommu *iommu)
163 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
166 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
168 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
171 static void init_translation_status(struct intel_iommu *iommu)
175 gsts = readl(iommu->reg + DMAR_GSTS_REG);
176 if (gsts & DMA_GSTS_TES)
177 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
180 static int __init intel_iommu_setup(char *str)
186 if (!strncmp(str, "on", 2)) {
188 pr_info("IOMMU enabled\n");
189 } else if (!strncmp(str, "off", 3)) {
191 no_platform_optin = 1;
192 pr_info("IOMMU disabled\n");
193 } else if (!strncmp(str, "igfx_off", 8)) {
195 pr_info("Disable GFX device mapping\n");
196 } else if (!strncmp(str, "forcedac", 8)) {
197 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
198 iommu_dma_forcedac = true;
199 } else if (!strncmp(str, "strict", 6)) {
200 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
201 iommu_set_dma_strict();
202 } else if (!strncmp(str, "sp_off", 6)) {
203 pr_info("Disable supported super page\n");
204 intel_iommu_superpage = 0;
205 } else if (!strncmp(str, "sm_on", 5)) {
206 pr_info("Enable scalable mode if hardware supports\n");
208 } else if (!strncmp(str, "sm_off", 6)) {
209 pr_info("Scalable mode is disallowed\n");
211 } else if (!strncmp(str, "tboot_noforce", 13)) {
212 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
213 intel_iommu_tboot_noforce = 1;
215 pr_notice("Unknown option - '%s'\n", str);
218 str += strcspn(str, ",");
225 __setup("intel_iommu=", intel_iommu_setup);
227 void *alloc_pgtable_page(int node, gfp_t gfp)
232 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
234 vaddr = page_address(page);
238 void free_pgtable_page(void *vaddr)
240 free_page((unsigned long)vaddr);
243 static int domain_type_is_si(struct dmar_domain *domain)
245 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
248 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
250 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
252 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
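/*
 * For example, an agaw corresponding to a 48-bit address width gives
 * addr_width == 36, so any pfn with bits set at or above bit 36 (an IOVA
 * at or above 1 << 48) is reported as unsupported.
 */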
256 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
257 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
258 * the returned SAGAW.
260 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
262 unsigned long fl_sagaw, sl_sagaw;
264 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
265 sl_sagaw = cap_sagaw(iommu->cap);
267 /* Second level only. */
268 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
271 /* First level only. */
272 if (!ecap_slts(iommu->ecap))
275 return fl_sagaw & sl_sagaw;
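/*
 * Per the SAGAW encoding referenced above, BIT(2) is the 4-level (48-bit)
 * width and BIT(3) the 5-level (57-bit) width. As an illustration: with
 * scalable mode, first-level 5-level paging support and cap_sagaw() == 0x4,
 * the intersection is 0x4, i.e. only the 4-level width is usable by both
 * first- and second-level translation.
 */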
278 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
283 sagaw = __iommu_calculate_sagaw(iommu);
284 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
285 if (test_bit(agaw, &sagaw))
293 * Calculate max SAGAW for each iommu.
295 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
297 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
301 * calculate agaw for each iommu.
302 * "SAGAW" may differ across iommus, so use a default agaw and fall back
303 * to a smaller supported agaw for iommus that don't support the default.
305 int iommu_calculate_agaw(struct intel_iommu *iommu)
307 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
310 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
312 return sm_supported(iommu) ?
313 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
316 static void domain_update_iommu_coherency(struct dmar_domain *domain)
318 struct iommu_domain_info *info;
319 struct dmar_drhd_unit *drhd;
320 struct intel_iommu *iommu;
324 domain->iommu_coherency = true;
325 xa_for_each(&domain->iommu_array, i, info) {
327 if (!iommu_paging_structure_coherency(info->iommu)) {
328 domain->iommu_coherency = false;
335 /* No hardware attached; use lowest common denominator */
337 for_each_active_iommu(iommu, drhd) {
338 if (!iommu_paging_structure_coherency(iommu)) {
339 domain->iommu_coherency = false;
346 static int domain_update_iommu_superpage(struct dmar_domain *domain,
347 struct intel_iommu *skip)
349 struct dmar_drhd_unit *drhd;
350 struct intel_iommu *iommu;
353 if (!intel_iommu_superpage)
356 /* set iommu_superpage to the smallest common denominator */
358 for_each_active_iommu(iommu, drhd) {
360 if (domain && domain->use_first_level) {
361 if (!cap_fl1gp_support(iommu->cap))
364 mask &= cap_super_page_val(iommu->cap);
376 static int domain_update_device_node(struct dmar_domain *domain)
378 struct device_domain_info *info;
379 int nid = NUMA_NO_NODE;
382 spin_lock_irqsave(&domain->lock, flags);
383 list_for_each_entry(info, &domain->devices, link) {
385 * There could be multiple device NUMA nodes, as devices within
386 * the same domain may sit behind different IOMMUs. There is no
387 * perfect answer in such a situation, so we use a first-come,
388 * first-served policy.
390 nid = dev_to_node(info->dev);
391 if (nid != NUMA_NO_NODE)
394 spin_unlock_irqrestore(&domain->lock, flags);
399 static void domain_update_iotlb(struct dmar_domain *domain);
401 /* Return the super pagesize bitmap if supported. */
402 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
404 unsigned long bitmap = 0;
407 * A 1-level super page supports a page size of 2MiB; a 2-level super
408 * page supports page sizes of both 2MiB and 1GiB.
410 if (domain->iommu_superpage == 1)
412 else if (domain->iommu_superpage == 2)
413 bitmap |= SZ_2M | SZ_1G;
418 /* Some capabilities may be different across iommus */
419 void domain_update_iommu_cap(struct dmar_domain *domain)
421 domain_update_iommu_coherency(domain);
422 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
425 * If RHSA is missing, we should default to the device numa domain
428 if (domain->nid == NUMA_NO_NODE)
429 domain->nid = domain_update_device_node(domain);
432 * First-level translation restricts the input-address to a
433 * canonical address (i.e., address bits 63:N have the same
434 * value as address bit [N-1], where N is 48-bits with 4-level
435 * paging and 57-bits with 5-level paging). Hence, skip bit
438 if (domain->use_first_level)
439 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
441 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
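/*
 * Example: with gaw == 48, second-level translation exposes an aperture of
 * [0, 2^48 - 1], while first-level translation drops the top bit and
 * exposes [0, 2^47 - 1] so that all IOVAs stay canonical.
 */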
443 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
444 domain_update_iotlb(domain);
447 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
450 struct root_entry *root = &iommu->root_entry[bus];
451 struct context_entry *context;
455 * Unless the caller requested that a new entry be allocated,
456 * returning a copied context entry makes no sense.
458 if (!alloc && context_copied(iommu, bus, devfn))
462 if (sm_supported(iommu)) {
470 context = phys_to_virt(*entry & VTD_PAGE_MASK);
472 unsigned long phy_addr;
476 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
480 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
481 phy_addr = virt_to_phys((void *)context);
482 *entry = phy_addr | 1;
483 __iommu_flush_cache(iommu, entry, sizeof(*entry));
485 return &context[devfn];
489 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
490 * sub-hierarchy of a candidate PCI-PCI bridge
491 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
492 * @bridge: the candidate PCI-PCI bridge
494 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
497 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
499 struct pci_dev *pdev, *pbridge;
501 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
504 pdev = to_pci_dev(dev);
505 pbridge = to_pci_dev(bridge);
507 if (pbridge->subordinate &&
508 pbridge->subordinate->number <= pdev->bus->number &&
509 pbridge->subordinate->busn_res.end >= pdev->bus->number)
515 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
517 struct dmar_drhd_unit *drhd;
521 /* We know that this device on this chipset has its own IOMMU.
522 * If we find it under a different IOMMU, then the BIOS is lying
523 * to us. Hope that the IOMMU for this device is actually
524 * disabled, and it needs no translation...
526 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
529 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
534 /* we know that this iommu should be at offset 0xa000 from vtbar */
535 drhd = dmar_find_matched_drhd_unit(pdev);
536 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
537 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
538 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
545 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
547 if (!iommu || iommu->drhd->ignored)
550 if (dev_is_pci(dev)) {
551 struct pci_dev *pdev = to_pci_dev(dev);
553 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
554 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
555 quirk_ioat_snb_local_iommu(pdev))
562 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
564 struct dmar_drhd_unit *drhd = NULL;
565 struct pci_dev *pdev = NULL;
566 struct intel_iommu *iommu;
574 if (dev_is_pci(dev)) {
575 struct pci_dev *pf_pdev;
577 pdev = pci_real_dma_dev(to_pci_dev(dev));
579 /* VFs aren't listed in scope tables; we need to look up
580 * the PF instead to find the IOMMU. */
581 pf_pdev = pci_physfn(pdev);
583 segment = pci_domain_nr(pdev->bus);
584 } else if (has_acpi_companion(dev))
585 dev = &ACPI_COMPANION(dev)->dev;
588 for_each_iommu(iommu, drhd) {
589 if (pdev && segment != drhd->segment)
592 for_each_active_dev_scope(drhd->devices,
593 drhd->devices_cnt, i, tmp) {
595 /* For a VF use its original BDF# not that of the PF
596 * which we used for the IOMMU lookup. Strictly speaking
597 * we could do this for all PCI devices; we only need to
598 * get the BDF# from the scope table for ACPI matches. */
599 if (pdev && pdev->is_virtfn)
603 *bus = drhd->devices[i].bus;
604 *devfn = drhd->devices[i].devfn;
609 if (is_downstream_to_pci_bridge(dev, tmp))
613 if (pdev && drhd->include_all) {
616 *bus = pdev->bus->number;
617 *devfn = pdev->devfn;
624 if (iommu_is_dummy(iommu, dev))
632 static void domain_flush_cache(struct dmar_domain *domain,
633 void *addr, int size)
635 if (!domain->iommu_coherency)
636 clflush_cache_range(addr, size);
639 static void free_context_table(struct intel_iommu *iommu)
641 struct context_entry *context;
644 if (!iommu->root_entry)
647 for (i = 0; i < ROOT_ENTRY_NR; i++) {
648 context = iommu_context_addr(iommu, i, 0, 0);
650 free_pgtable_page(context);
652 if (!sm_supported(iommu))
655 context = iommu_context_addr(iommu, i, 0x80, 0);
657 free_pgtable_page(context);
660 free_pgtable_page(iommu->root_entry);
661 iommu->root_entry = NULL;
664 #ifdef CONFIG_DMAR_DEBUG
665 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
666 u8 bus, u8 devfn, struct dma_pte *parent, int level)
672 offset = pfn_level_offset(pfn, level);
673 pte = &parent[offset];
674 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
675 pr_info("PTE not present at level %d\n", level);
679 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
684 parent = phys_to_virt(dma_pte_addr(pte));
689 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
690 unsigned long long addr, u32 pasid)
692 struct pasid_dir_entry *dir, *pde;
693 struct pasid_entry *entries, *pte;
694 struct context_entry *ctx_entry;
695 struct root_entry *rt_entry;
696 int i, dir_index, index, level;
697 u8 devfn = source_id & 0xff;
698 u8 bus = source_id >> 8;
699 struct dma_pte *pgtable;
701 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
703 /* root entry dump */
704 rt_entry = &iommu->root_entry[bus];
706 pr_info("root table entry is not present\n");
710 if (sm_supported(iommu))
711 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
712 rt_entry->hi, rt_entry->lo);
714 pr_info("root entry: 0x%016llx", rt_entry->lo);
716 /* context entry dump */
717 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
719 pr_info("context table entry is not present\n");
723 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
724 ctx_entry->hi, ctx_entry->lo);
726 /* legacy mode does not require PASID entries */
727 if (!sm_supported(iommu)) {
728 level = agaw_to_level(ctx_entry->hi & 7);
729 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
733 /* get the pointer to pasid directory entry */
734 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
736 pr_info("pasid directory entry is not present\n");
739 /* For request-without-pasid, get the pasid from context entry */
740 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
741 pasid = IOMMU_NO_PASID;
743 dir_index = pasid >> PASID_PDE_SHIFT;
744 pde = &dir[dir_index];
745 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
747 /* get the pointer to the pasid table entry */
748 entries = get_pasid_table_from_pde(pde);
750 pr_info("pasid table entry is not present\n");
753 index = pasid & PASID_PTE_MASK;
754 pte = &entries[index];
755 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
756 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
758 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
759 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
760 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
762 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
763 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
767 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
771 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
772 unsigned long pfn, int *target_level,
775 struct dma_pte *parent, *pte;
776 int level = agaw_to_level(domain->agaw);
779 if (!domain_pfn_supported(domain, pfn))
780 /* Address beyond IOMMU's addressing capabilities. */
783 parent = domain->pgd;
788 offset = pfn_level_offset(pfn, level);
789 pte = &parent[offset];
790 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
792 if (level == *target_level)
795 if (!dma_pte_present(pte)) {
798 tmp_page = alloc_pgtable_page(domain->nid, gfp);
803 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
804 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
805 if (domain->use_first_level)
806 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
808 if (cmpxchg64(&pte->val, 0ULL, pteval))
809 /* Someone else set it while we were thinking; use theirs. */
810 free_pgtable_page(tmp_page);
812 domain_flush_cache(domain, pte, sizeof(*pte));
817 parent = phys_to_virt(dma_pte_addr(pte));
822 *target_level = level;
827 /* return the pte of an address at a specific level */
828 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
830 int level, int *large_page)
832 struct dma_pte *parent, *pte;
833 int total = agaw_to_level(domain->agaw);
836 parent = domain->pgd;
837 while (level <= total) {
838 offset = pfn_level_offset(pfn, total);
839 pte = &parent[offset];
843 if (!dma_pte_present(pte)) {
848 if (dma_pte_superpage(pte)) {
853 parent = phys_to_virt(dma_pte_addr(pte));
859 /* clear last level pte; a tlb flush must follow */
860 static void dma_pte_clear_range(struct dmar_domain *domain,
861 unsigned long start_pfn,
862 unsigned long last_pfn)
864 unsigned int large_page;
865 struct dma_pte *first_pte, *pte;
867 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
868 WARN_ON(start_pfn > last_pfn))
871 /* we don't need lock here; nobody else touches the iova range */
874 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
876 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
881 start_pfn += lvl_to_nr_pages(large_page);
883 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
885 domain_flush_cache(domain, first_pte,
886 (void *)pte - (void *)first_pte);
888 } while (start_pfn && start_pfn <= last_pfn);
891 static void dma_pte_free_level(struct dmar_domain *domain, int level,
892 int retain_level, struct dma_pte *pte,
893 unsigned long pfn, unsigned long start_pfn,
894 unsigned long last_pfn)
896 pfn = max(start_pfn, pfn);
897 pte = &pte[pfn_level_offset(pfn, level)];
900 unsigned long level_pfn;
901 struct dma_pte *level_pte;
903 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
906 level_pfn = pfn & level_mask(level);
907 level_pte = phys_to_virt(dma_pte_addr(pte));
910 dma_pte_free_level(domain, level - 1, retain_level,
911 level_pte, level_pfn, start_pfn,
916 * Free the page table if we're below the level we want to
917 * retain and the range covers the entire table.
919 if (level < retain_level && !(start_pfn > level_pfn ||
920 last_pfn < level_pfn + level_size(level) - 1)) {
922 domain_flush_cache(domain, pte, sizeof(*pte));
923 free_pgtable_page(level_pte);
926 pfn += level_size(level);
927 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
931 * clear last level (leaf) ptes and free page table pages below the
932 * level we wish to keep intact.
934 static void dma_pte_free_pagetable(struct dmar_domain *domain,
935 unsigned long start_pfn,
936 unsigned long last_pfn,
939 dma_pte_clear_range(domain, start_pfn, last_pfn);
941 /* We don't need lock here; nobody else touches the iova range */
942 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
943 domain->pgd, 0, start_pfn, last_pfn);
946 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
947 free_pgtable_page(domain->pgd);
952 /* When a page at a given level is being unlinked from its parent, we don't
953 need to *modify* it at all. All we need to do is make a list of all the
954 pages which can be freed just as soon as we've flushed the IOTLB and we
955 know the hardware page-walk will no longer touch them.
956 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
958 static void dma_pte_list_pagetables(struct dmar_domain *domain,
959 int level, struct dma_pte *pte,
960 struct list_head *freelist)
964 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
965 list_add_tail(&pg->lru, freelist);
970 pte = page_address(pg);
972 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
973 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
975 } while (!first_pte_in_page(pte));
978 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
979 struct dma_pte *pte, unsigned long pfn,
980 unsigned long start_pfn, unsigned long last_pfn,
981 struct list_head *freelist)
983 struct dma_pte *first_pte = NULL, *last_pte = NULL;
985 pfn = max(start_pfn, pfn);
986 pte = &pte[pfn_level_offset(pfn, level)];
989 unsigned long level_pfn = pfn & level_mask(level);
991 if (!dma_pte_present(pte))
994 /* If range covers entire pagetable, free it */
995 if (start_pfn <= level_pfn &&
996 last_pfn >= level_pfn + level_size(level) - 1) {
997 /* These subordinate page tables are going away entirely. Don't
998 bother to clear them; we're just going to *free* them. */
999 if (level > 1 && !dma_pte_superpage(pte))
1000 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1006 } else if (level > 1) {
1007 /* Recurse down into a level that isn't *entirely* obsolete */
1008 dma_pte_clear_level(domain, level - 1,
1009 phys_to_virt(dma_pte_addr(pte)),
1010 level_pfn, start_pfn, last_pfn,
1014 pfn = level_pfn + level_size(level);
1015 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1018 domain_flush_cache(domain, first_pte,
1019 (void *)++last_pte - (void *)first_pte);
1022 /* We can't just free the pages because the IOMMU may still be walking
1023 the page tables, and may have cached the intermediate levels. The
1024 pages can only be freed after the IOTLB flush has been done. */
1025 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1026 unsigned long last_pfn, struct list_head *freelist)
1028 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1029 WARN_ON(start_pfn > last_pfn))
1032 /* we don't need lock here; nobody else touches the iova range */
1033 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1034 domain->pgd, 0, start_pfn, last_pfn, freelist);
1037 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1038 struct page *pgd_page = virt_to_page(domain->pgd);
1039 list_add_tail(&pgd_page->lru, freelist);
1044 /* iommu handling */
1045 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1047 struct root_entry *root;
1049 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1051 pr_err("Allocating root entry for %s failed\n",
1056 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1057 iommu->root_entry = root;
1062 static void iommu_set_root_entry(struct intel_iommu *iommu)
1068 addr = virt_to_phys(iommu->root_entry);
1069 if (sm_supported(iommu))
1070 addr |= DMA_RTADDR_SMT;
1072 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1073 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1075 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1077 /* Make sure hardware completes it */
1078 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1079 readl, (sts & DMA_GSTS_RTPS), sts);
1081 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1084 * Hardware invalidates all DMA remapping hardware translation
1085 * caches as part of SRTP flow.
1087 if (cap_esrtps(iommu->cap))
1090 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1091 if (sm_supported(iommu))
1092 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1093 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1096 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1101 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1104 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1105 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1107 /* Make sure hardware completes it */
1108 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1109 readl, (!(val & DMA_GSTS_WBFS)), val);
1111 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1114 /* return value determines whether we need a write buffer flush */
1115 static void __iommu_flush_context(struct intel_iommu *iommu,
1116 u16 did, u16 source_id, u8 function_mask,
1123 case DMA_CCMD_GLOBAL_INVL:
1124 val = DMA_CCMD_GLOBAL_INVL;
1126 case DMA_CCMD_DOMAIN_INVL:
1127 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1129 case DMA_CCMD_DEVICE_INVL:
1130 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1131 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1134 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1138 val |= DMA_CCMD_ICC;
1140 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1141 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1143 /* Make sure hardware completes it */
1144 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1145 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1147 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1150 /* return value determines whether we need a write buffer flush */
1151 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1152 u64 addr, unsigned int size_order, u64 type)
1154 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1155 u64 val = 0, val_iva = 0;
1159 case DMA_TLB_GLOBAL_FLUSH:
1160 /* global flush doesn't need to set IVA_REG */
1161 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1163 case DMA_TLB_DSI_FLUSH:
1164 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1166 case DMA_TLB_PSI_FLUSH:
1167 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1168 /* IH bit is passed in as part of address */
1169 val_iva = size_order | addr;
1172 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1177 if (cap_write_drain(iommu->cap))
1178 val |= DMA_TLB_WRITE_DRAIN;
1180 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1181 /* Note: Only uses first TLB reg currently */
1183 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1184 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1186 /* Make sure hardware completes it */
1187 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1188 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1190 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1192 /* check IOTLB invalidation granularity */
1193 if (DMA_TLB_IAIG(val) == 0)
1194 pr_err("Flush IOTLB failed\n");
1195 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1196 pr_debug("TLB flush request %Lx, actual %Lx\n",
1197 (unsigned long long)DMA_TLB_IIRG(type),
1198 (unsigned long long)DMA_TLB_IAIG(val));
1201 static struct device_domain_info *
1202 domain_lookup_dev_info(struct dmar_domain *domain,
1203 struct intel_iommu *iommu, u8 bus, u8 devfn)
1205 struct device_domain_info *info;
1206 unsigned long flags;
1208 spin_lock_irqsave(&domain->lock, flags);
1209 list_for_each_entry(info, &domain->devices, link) {
1210 if (info->iommu == iommu && info->bus == bus &&
1211 info->devfn == devfn) {
1212 spin_unlock_irqrestore(&domain->lock, flags);
1216 spin_unlock_irqrestore(&domain->lock, flags);
1221 static void domain_update_iotlb(struct dmar_domain *domain)
1223 struct dev_pasid_info *dev_pasid;
1224 struct device_domain_info *info;
1225 bool has_iotlb_device = false;
1226 unsigned long flags;
1228 spin_lock_irqsave(&domain->lock, flags);
1229 list_for_each_entry(info, &domain->devices, link) {
1230 if (info->ats_enabled) {
1231 has_iotlb_device = true;
1236 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1237 info = dev_iommu_priv_get(dev_pasid->dev);
1238 if (info->ats_enabled) {
1239 has_iotlb_device = true;
1243 domain->has_iotlb_device = has_iotlb_device;
1244 spin_unlock_irqrestore(&domain->lock, flags);
1248 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1249 * IDs ranging from 0x4940 to 0x4943. It is exempted from the risky_device()
1250 * check because it applies only to the built-in QAT devices and it doesn't
1251 * grant additional privileges.
1253 #define BUGGY_QAT_DEVID_MASK 0x4940
1254 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1256 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1259 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1265 static void iommu_enable_pci_caps(struct device_domain_info *info)
1267 struct pci_dev *pdev;
1269 if (!dev_is_pci(info->dev))
1272 pdev = to_pci_dev(info->dev);
1274 /* The PCIe spec, in its wisdom, declares that the behaviour of
1275 the device if you enable PASID support after ATS support is
1276 undefined. So always enable PASID support on devices which
1277 have it, even if we can't yet know if we're ever going to use it. */
1279 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1280 info->pasid_enabled = 1;
1282 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1283 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1284 info->ats_enabled = 1;
1285 domain_update_iotlb(info->domain);
1289 static void iommu_disable_pci_caps(struct device_domain_info *info)
1291 struct pci_dev *pdev;
1293 if (!dev_is_pci(info->dev))
1296 pdev = to_pci_dev(info->dev);
1298 if (info->ats_enabled) {
1299 pci_disable_ats(pdev);
1300 info->ats_enabled = 0;
1301 domain_update_iotlb(info->domain);
1304 if (info->pasid_enabled) {
1305 pci_disable_pasid(pdev);
1306 info->pasid_enabled = 0;
1310 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1311 u64 addr, unsigned int mask)
1315 if (!info || !info->ats_enabled)
1318 sid = info->bus << 8 | info->devfn;
1319 qdep = info->ats_qdep;
1320 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1322 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
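/* The sid used above is the PCI requester ID, e.g. bus 0x3a, devfn 0x10 encode to 0x3a10. */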
1325 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1326 u64 addr, unsigned mask)
1328 struct dev_pasid_info *dev_pasid;
1329 struct device_domain_info *info;
1330 unsigned long flags;
1332 if (!domain->has_iotlb_device)
1335 spin_lock_irqsave(&domain->lock, flags);
1336 list_for_each_entry(info, &domain->devices, link)
1337 __iommu_flush_dev_iotlb(info, addr, mask);
1339 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1340 info = dev_iommu_priv_get(dev_pasid->dev);
1342 if (!info->ats_enabled)
1345 qi_flush_dev_iotlb_pasid(info->iommu,
1346 PCI_DEVID(info->bus, info->devfn),
1347 info->pfsid, dev_pasid->pasid,
1348 info->ats_qdep, addr,
1351 spin_unlock_irqrestore(&domain->lock, flags);
1354 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1355 struct dmar_domain *domain, u64 addr,
1356 unsigned long npages, bool ih)
1358 u16 did = domain_id_iommu(domain, iommu);
1359 struct dev_pasid_info *dev_pasid;
1360 unsigned long flags;
1362 spin_lock_irqsave(&domain->lock, flags);
1363 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1364 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1366 if (!list_empty(&domain->devices))
1367 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1368 spin_unlock_irqrestore(&domain->lock, flags);
1371 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1372 struct dmar_domain *domain,
1373 unsigned long pfn, unsigned int pages,
1376 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1377 unsigned int mask = ilog2(aligned_pages);
1378 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1379 u16 did = domain_id_iommu(domain, iommu);
1381 if (WARN_ON(!pages))
1387 if (domain->use_first_level) {
1388 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1390 unsigned long bitmask = aligned_pages - 1;
1393 * PSI masks the low order bits of the base address. If the
1394 * address isn't aligned to the mask, then compute a mask value
1395 * needed to ensure the target range is flushed.
1397 if (unlikely(bitmask & pfn)) {
1398 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1401 * Since end_pfn <= pfn + bitmask, the only way bits
1402 * higher than bitmask can differ in pfn and end_pfn is
1403 * by carrying. This means after masking out bitmask,
1404 * high bits starting with the first set bit in
1405 * shared_bits are all equal in both pfn and end_pfn.
1407 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1408 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
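/*
 * Worked example: pfn == 0x1005 and pages == 4 give aligned_pages == 4
 * and bitmask == 3. The pfn is not 4-page aligned, so end_pfn == 0x1008,
 * pfn ^ end_pfn == 0xd and shared_bits == ~0xd & ~0x3, whose lowest set
 * bit is bit 4. The resulting mask of 4 flushes the aligned 16-page block
 * 0x1000-0x100f, which covers 0x1005-0x1008.
 */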
1412 * Fall back to domain selective flush if there is no PSI support or
1413 * the size is too big.
1415 if (!cap_pgsel_inv(iommu->cap) ||
1416 mask > cap_max_amask_val(iommu->cap))
1417 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1420 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1425 * In caching mode, changes of pages from non-present to present require
1426 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1428 if (!cap_caching_mode(iommu->cap) || !map)
1429 iommu_flush_dev_iotlb(domain, addr, mask);
1432 /* Notification for newly created mappings */
1433 static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1434 unsigned long pfn, unsigned int pages)
1437 * It's a non-present to present mapping. Only flush if caching mode
1440 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1441 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1443 iommu_flush_write_buffer(iommu);
1446 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1448 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1449 struct iommu_domain_info *info;
1452 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1453 struct intel_iommu *iommu = info->iommu;
1454 u16 did = domain_id_iommu(dmar_domain, iommu);
1456 if (dmar_domain->use_first_level)
1457 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1459 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1462 if (!cap_caching_mode(iommu->cap))
1463 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1467 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1470 unsigned long flags;
1472 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1475 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1476 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1477 pmen &= ~DMA_PMEN_EPM;
1478 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1480 /* wait for the protected region status bit to clear */
1481 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1482 readl, !(pmen & DMA_PMEN_PRS), pmen);
1484 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1487 static void iommu_enable_translation(struct intel_iommu *iommu)
1490 unsigned long flags;
1492 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1493 iommu->gcmd |= DMA_GCMD_TE;
1494 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1496 /* Make sure hardware completes it */
1497 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1498 readl, (sts & DMA_GSTS_TES), sts);
1500 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1503 static void iommu_disable_translation(struct intel_iommu *iommu)
1508 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1509 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1512 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1513 iommu->gcmd &= ~DMA_GCMD_TE;
1514 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1516 /* Make sure hardware completes it */
1517 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1518 readl, (!(sts & DMA_GSTS_TES)), sts);
1520 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1523 static int iommu_init_domains(struct intel_iommu *iommu)
1527 ndomains = cap_ndoms(iommu->cap);
1528 pr_debug("%s: Number of Domains supported <%d>\n",
1529 iommu->name, ndomains);
1531 spin_lock_init(&iommu->lock);
1533 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1534 if (!iommu->domain_ids)
1538 * If Caching mode is set, then invalid translations are tagged
1539 * with domain-id 0, hence we need to pre-allocate it. We also
1540 * use domain-id 0 as a marker for non-allocated domain-id, so
1541 * make sure it is not used for a real domain.
1543 set_bit(0, iommu->domain_ids);
1546 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1547 * entry for first-level or pass-through translation modes should
1548 * be programmed with a domain id different from those used for
1549 * second-level or nested translation. We reserve a domain id for
1552 if (sm_supported(iommu))
1553 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1558 static void disable_dmar_iommu(struct intel_iommu *iommu)
1560 if (!iommu->domain_ids)
1564 * All iommu domains must have been detached from the devices,
1565 * hence there should be no domain IDs in use.
1567 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1568 > NUM_RESERVED_DID))
1571 if (iommu->gcmd & DMA_GCMD_TE)
1572 iommu_disable_translation(iommu);
1575 static void free_dmar_iommu(struct intel_iommu *iommu)
1577 if (iommu->domain_ids) {
1578 bitmap_free(iommu->domain_ids);
1579 iommu->domain_ids = NULL;
1582 if (iommu->copied_tables) {
1583 bitmap_free(iommu->copied_tables);
1584 iommu->copied_tables = NULL;
1587 /* free context mapping */
1588 free_context_table(iommu);
1590 #ifdef CONFIG_INTEL_IOMMU_SVM
1591 if (pasid_supported(iommu)) {
1592 if (ecap_prs(iommu->ecap))
1593 intel_svm_finish_prq(iommu);
1599 * Check and return whether first level is used by default for
1602 static bool first_level_by_default(unsigned int type)
1604 /* Only SL is available in legacy mode */
1605 if (!scalable_mode_support())
1608 /* Only one level (either FL or SL) is available, just use it */
1609 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1610 return intel_cap_flts_sanity();
1612 /* Both levels are available, decide it based on domain type */
1613 return type != IOMMU_DOMAIN_UNMANAGED;
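/*
 * In other words, when both levels are available, kernel-managed DMA and
 * identity domains default to first-level translation, while
 * IOMMU_DOMAIN_UNMANAGED domains (typically used for device assignment)
 * stay on second-level translation.
 */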
1616 static struct dmar_domain *alloc_domain(unsigned int type)
1618 struct dmar_domain *domain;
1620 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1624 domain->nid = NUMA_NO_NODE;
1625 if (first_level_by_default(type))
1626 domain->use_first_level = true;
1627 domain->has_iotlb_device = false;
1628 INIT_LIST_HEAD(&domain->devices);
1629 INIT_LIST_HEAD(&domain->dev_pasids);
1630 spin_lock_init(&domain->lock);
1631 xa_init(&domain->iommu_array);
1636 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1638 struct iommu_domain_info *info, *curr;
1639 unsigned long ndomains;
1640 int num, ret = -ENOSPC;
1642 info = kzalloc(sizeof(*info), GFP_KERNEL);
1646 spin_lock(&iommu->lock);
1647 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1650 spin_unlock(&iommu->lock);
1655 ndomains = cap_ndoms(iommu->cap);
1656 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1657 if (num >= ndomains) {
1658 pr_err("%s: No free domain ids\n", iommu->name);
1662 set_bit(num, iommu->domain_ids);
1665 info->iommu = iommu;
1666 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1667 NULL, info, GFP_ATOMIC);
1669 ret = xa_err(curr) ? : -EBUSY;
1672 domain_update_iommu_cap(domain);
1674 spin_unlock(&iommu->lock);
1678 clear_bit(info->did, iommu->domain_ids);
1680 spin_unlock(&iommu->lock);
1685 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1687 struct iommu_domain_info *info;
1689 spin_lock(&iommu->lock);
1690 info = xa_load(&domain->iommu_array, iommu->seq_id);
1691 if (--info->refcnt == 0) {
1692 clear_bit(info->did, iommu->domain_ids);
1693 xa_erase(&domain->iommu_array, iommu->seq_id);
1694 domain->nid = NUMA_NO_NODE;
1695 domain_update_iommu_cap(domain);
1698 spin_unlock(&iommu->lock);
1701 static int guestwidth_to_adjustwidth(int gaw)
1704 int r = (gaw - 12) % 9;
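/*
 * Here r measures how far (gaw - 12) is from a multiple of 9, i.e. from a
 * whole number of 9-bit page-table levels: a guest width of 39 or 48 gives
 * r == 0, while a guest width of 40 gives r == 1.
 */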
1715 static void domain_exit(struct dmar_domain *domain)
1718 LIST_HEAD(freelist);
1720 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1721 put_pages_list(&freelist);
1724 if (WARN_ON(!list_empty(&domain->devices)))
1731 * Get the PASID directory size for scalable mode context entry.
1732 * Value of X in the PDTS field of a scalable mode context entry
1733 * indicates PASID directory with 2^(X + 7) entries.
1735 static unsigned long context_get_sm_pds(struct pasid_table *table)
1737 unsigned long pds, max_pde;
1739 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1740 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
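/*
 * Example, assuming PASID_PDE_SHIFT is 6 (64 PASID entries per directory
 * entry): max_pasid == 1 << 20 gives max_pde == 1 << 14, find_first_bit()
 * returns 14, and per the 2^(X + 7) rule above the PDTS field ends up
 * encoding X == 7.
 */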
1747 static int domain_context_mapping_one(struct dmar_domain *domain,
1748 struct intel_iommu *iommu,
1749 struct pasid_table *table,
1752 struct device_domain_info *info =
1753 domain_lookup_dev_info(domain, iommu, bus, devfn);
1754 u16 did = domain_id_iommu(domain, iommu);
1755 int translation = CONTEXT_TT_MULTI_LEVEL;
1756 struct context_entry *context;
1759 if (hw_pass_through && domain_type_is_si(domain))
1760 translation = CONTEXT_TT_PASS_THROUGH;
1762 pr_debug("Set context mapping for %02x:%02x.%d\n",
1763 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1765 spin_lock(&iommu->lock);
1767 context = iommu_context_addr(iommu, bus, devfn, 1);
1772 if (context_present(context) && !context_copied(iommu, bus, devfn))
1776 * For kdump cases, old valid entries may be cached due to the
1777 * in-flight DMA and copied pgtable, but there is no unmapping
1778 * behaviour for them, thus we need an explicit cache flush for
1779 * the newly-mapped device. For kdump, at this point, the device
1780 * is supposed to finish reset at its driver probe stage, so no
1781 * in-flight DMA will exist, and we don't need to worry anymore
1784 if (context_copied(iommu, bus, devfn)) {
1785 u16 did_old = context_domain_id(context);
1787 if (did_old < cap_ndoms(iommu->cap)) {
1788 iommu->flush.flush_context(iommu, did_old,
1789 (((u16)bus) << 8) | devfn,
1790 DMA_CCMD_MASK_NOBIT,
1791 DMA_CCMD_DEVICE_INVL);
1792 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1796 clear_context_copied(iommu, bus, devfn);
1799 context_clear_entry(context);
1801 if (sm_supported(iommu)) {
1804 /* Setup the PASID DIR pointer: */
1805 pds = context_get_sm_pds(table);
1806 context->lo = (u64)virt_to_phys(table->table) |
1809 /* Setup the RID_PASID field: */
1810 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1813 * Setup the Device-TLB enable bit and Page request
1816 if (info && info->ats_supported)
1817 context_set_sm_dte(context);
1818 if (info && info->pri_supported)
1819 context_set_sm_pre(context);
1820 if (info && info->pasid_supported)
1821 context_set_pasid(context);
1823 struct dma_pte *pgd = domain->pgd;
1826 context_set_domain_id(context, did);
1828 if (translation != CONTEXT_TT_PASS_THROUGH) {
1830 * Skip top levels of page tables for an iommu which has
1831 * a smaller agaw than the default. Unnecessary for PT mode.
1833 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1835 pgd = phys_to_virt(dma_pte_addr(pgd));
1836 if (!dma_pte_present(pgd))
1840 if (info && info->ats_supported)
1841 translation = CONTEXT_TT_DEV_IOTLB;
1843 translation = CONTEXT_TT_MULTI_LEVEL;
1845 context_set_address_root(context, virt_to_phys(pgd));
1846 context_set_address_width(context, agaw);
1849 * In pass through mode, AW must be programmed to
1850 * indicate the largest AGAW value supported by
1851 * hardware. And ASR is ignored by hardware.
1853 context_set_address_width(context, iommu->msagaw);
1856 context_set_translation_type(context, translation);
1859 context_set_fault_enable(context);
1860 context_set_present(context);
1861 if (!ecap_coherent(iommu->ecap))
1862 clflush_cache_range(context, sizeof(*context));
1865 * It's a non-present to present mapping. If hardware doesn't cache
1866 * non-present entries we only need to flush the write-buffer. If it
1867 * _does_ cache non-present entries, then it does so in the special
1868 * domain #0, which we have to flush:
1870 if (cap_caching_mode(iommu->cap)) {
1871 iommu->flush.flush_context(iommu, 0,
1872 (((u16)bus) << 8) | devfn,
1873 DMA_CCMD_MASK_NOBIT,
1874 DMA_CCMD_DEVICE_INVL);
1875 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1877 iommu_flush_write_buffer(iommu);
1883 spin_unlock(&iommu->lock);
1888 struct domain_context_mapping_data {
1889 struct dmar_domain *domain;
1890 struct intel_iommu *iommu;
1891 struct pasid_table *table;
1894 static int domain_context_mapping_cb(struct pci_dev *pdev,
1895 u16 alias, void *opaque)
1897 struct domain_context_mapping_data *data = opaque;
1899 return domain_context_mapping_one(data->domain, data->iommu,
1900 data->table, PCI_BUS_NUM(alias),
1905 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1907 struct device_domain_info *info = dev_iommu_priv_get(dev);
1908 struct domain_context_mapping_data data;
1909 struct intel_iommu *iommu = info->iommu;
1910 u8 bus = info->bus, devfn = info->devfn;
1911 struct pasid_table *table;
1913 table = intel_pasid_get_table(dev);
1915 if (!dev_is_pci(dev))
1916 return domain_context_mapping_one(domain, iommu, table,
1919 data.domain = domain;
1923 return pci_for_each_dma_alias(to_pci_dev(dev),
1924 &domain_context_mapping_cb, &data);
1927 /* Returns a number of VTD pages, but aligned to MM page size */
1928 static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1930 host_addr &= ~PAGE_MASK;
1931 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
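/*
 * Example with 4KiB pages: host_addr == 0x1234 and size == 0x2000 leave an
 * offset of 0x234 within the first page, so PAGE_ALIGN(0x2234) == 0x3000
 * and the function returns 3 VT-d pages.
 */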
1934 /* Return largest possible superpage level for a given mapping */
1935 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1936 unsigned long phy_pfn, unsigned long pages)
1938 int support, level = 1;
1939 unsigned long pfnmerge;
1941 support = domain->iommu_superpage;
1943 /* To use a large page, the virtual *and* physical addresses
1944 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1945 of them will mean we have to use smaller pages. So just
1946 merge them and check both at once. */
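/*
 * For instance, with a 9-bit stride, iov_pfn == 0x40200 and
 * phy_pfn == 0x80200 merge to 0xc0200: the low 9 bits are clear, so a
 * 2MiB page is possible if enough pages are being mapped, but the next
 * 9 bits are not clear, so 1GiB pages are ruled out.
 */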
1947 pfnmerge = iov_pfn | phy_pfn;
1949 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1950 pages >>= VTD_STRIDE_SHIFT;
1953 pfnmerge >>= VTD_STRIDE_SHIFT;
1961 * Ensure that old small page tables are removed to make room for superpage(s).
1962 * We're going to add new large pages, so make sure we don't remove their parent
1963 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1965 static void switch_to_super_page(struct dmar_domain *domain,
1966 unsigned long start_pfn,
1967 unsigned long end_pfn, int level)
1969 unsigned long lvl_pages = lvl_to_nr_pages(level);
1970 struct iommu_domain_info *info;
1971 struct dma_pte *pte = NULL;
1974 while (start_pfn <= end_pfn) {
1976 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1979 if (dma_pte_present(pte)) {
1980 dma_pte_free_pagetable(domain, start_pfn,
1981 start_pfn + lvl_pages - 1,
1984 xa_for_each(&domain->iommu_array, i, info)
1985 iommu_flush_iotlb_psi(info->iommu, domain,
1986 start_pfn, lvl_pages,
1991 start_pfn += lvl_pages;
1992 if (first_pte_in_page(pte))
1998 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1999 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2002 struct dma_pte *first_pte = NULL, *pte = NULL;
2003 unsigned int largepage_lvl = 0;
2004 unsigned long lvl_pages = 0;
2008 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2011 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2014 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2015 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2019 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2020 attr |= DMA_FL_PTE_PRESENT;
2021 if (domain->use_first_level) {
2022 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2023 if (prot & DMA_PTE_WRITE)
2024 attr |= DMA_FL_PTE_DIRTY;
2027 domain->has_mappings = true;
2029 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2031 while (nr_pages > 0) {
2035 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2036 phys_pfn, nr_pages);
2038 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2044 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2046 /* It is a large page */
2047 if (largepage_lvl > 1) {
2048 unsigned long end_pfn;
2049 unsigned long pages_to_remove;
2051 pteval |= DMA_PTE_LARGE_PAGE;
2052 pages_to_remove = min_t(unsigned long, nr_pages,
2053 nr_pte_to_next_page(pte) * lvl_pages);
2054 end_pfn = iov_pfn + pages_to_remove - 1;
2055 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2057 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2061 /* We don't need lock here, nobody else
2062 * touches the iova range
2064 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2066 static int dumps = 5;
2067 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2068 iov_pfn, tmp, (unsigned long long)pteval);
2071 debug_dma_dump_mappings(NULL);
2076 nr_pages -= lvl_pages;
2077 iov_pfn += lvl_pages;
2078 phys_pfn += lvl_pages;
2079 pteval += lvl_pages * VTD_PAGE_SIZE;
2081 /* If the next PTE would be the first in a new page, then we
2082 * need to flush the cache on the entries we've just written.
2083 * And then we'll need to recalculate 'pte', so clear it and
2084 * let it get set again in the if (!pte) block above.
2086 * If we're done (!nr_pages) we need to flush the cache too.
2088 * Also if we've been setting superpages, we may need to
2089 * recalculate 'pte' and switch back to smaller pages for the
2090 * end of the mapping, if the trailing size is not enough to
2091 * use another superpage (i.e. nr_pages < lvl_pages).
2094 if (!nr_pages || first_pte_in_page(pte) ||
2095 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2096 domain_flush_cache(domain, first_pte,
2097 (void *)pte - (void *)first_pte);
2105 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2107 struct intel_iommu *iommu = info->iommu;
2108 struct context_entry *context;
2114 spin_lock(&iommu->lock);
2115 context = iommu_context_addr(iommu, bus, devfn, 0);
2117 spin_unlock(&iommu->lock);
2121 if (sm_supported(iommu)) {
2122 if (hw_pass_through && domain_type_is_si(info->domain))
2123 did_old = FLPT_DEFAULT_DID;
2125 did_old = domain_id_iommu(info->domain, iommu);
2127 did_old = context_domain_id(context);
2130 context_clear_entry(context);
2131 __iommu_flush_cache(iommu, context, sizeof(*context));
2132 spin_unlock(&iommu->lock);
2133 iommu->flush.flush_context(iommu,
2135 (((u16)bus) << 8) | devfn,
2136 DMA_CCMD_MASK_NOBIT,
2137 DMA_CCMD_DEVICE_INVL);
2139 if (sm_supported(iommu))
2140 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2142 iommu->flush.flush_iotlb(iommu,
2148 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2151 static int domain_setup_first_level(struct intel_iommu *iommu,
2152 struct dmar_domain *domain,
2156 struct dma_pte *pgd = domain->pgd;
2161 * Skip top levels of page tables for an iommu which has
2162 * a smaller agaw than the default. Unnecessary for PT mode.
2164 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2165 pgd = phys_to_virt(dma_pte_addr(pgd));
2166 if (!dma_pte_present(pgd))
2170 level = agaw_to_level(agaw);
2171 if (level != 4 && level != 5)
2175 flags |= PASID_FLAG_FL5LP;
2177 if (domain->force_snooping)
2178 flags |= PASID_FLAG_PAGE_SNOOP;
2180 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2181 domain_id_iommu(domain, iommu),
2185 static bool dev_is_real_dma_subdevice(struct device *dev)
2187 return dev && dev_is_pci(dev) &&
2188 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2191 static int iommu_domain_identity_map(struct dmar_domain *domain,
2192 unsigned long first_vpfn,
2193 unsigned long last_vpfn)
2196 * RMRR range might have overlap with physical memory range,
2199 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2201 return __domain_mapping(domain, first_vpfn,
2202 first_vpfn, last_vpfn - first_vpfn + 1,
2203 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
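/*
 * Note that first_vpfn is passed as both the IOVA pfn and the physical
 * pfn, which is what makes this a 1:1 (identity) mapping of the range.
 */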
2206 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2208 static int __init si_domain_init(int hw)
2210 struct dmar_rmrr_unit *rmrr;
2214 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2218 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2219 domain_exit(si_domain);
2227 for_each_online_node(nid) {
2228 unsigned long start_pfn, end_pfn;
2231 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2232 ret = iommu_domain_identity_map(si_domain,
2233 mm_to_dma_pfn_start(start_pfn),
2234 mm_to_dma_pfn_end(end_pfn));
2241 * Identity map the RMRRs so that devices with RMRRs could also use
2244 for_each_rmrr_units(rmrr) {
2245 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2247 unsigned long long start = rmrr->base_address;
2248 unsigned long long end = rmrr->end_address;
2250 if (WARN_ON(end < start ||
2251 end >> agaw_to_width(si_domain->agaw)))
2254 ret = iommu_domain_identity_map(si_domain,
2255 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2256 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2265 static int dmar_domain_attach_device(struct dmar_domain *domain,
2268 struct device_domain_info *info = dev_iommu_priv_get(dev);
2269 struct intel_iommu *iommu = info->iommu;
2270 unsigned long flags;
2273 ret = domain_attach_iommu(domain, iommu);
2276 info->domain = domain;
2277 spin_lock_irqsave(&domain->lock, flags);
2278 list_add(&info->link, &domain->devices);
2279 spin_unlock_irqrestore(&domain->lock, flags);
2281 /* PASID table is mandatory for a PCI device in scalable mode. */
2282 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2283 /* Setup the PASID entry for requests without PASID: */
2284 if (hw_pass_through && domain_type_is_si(domain))
2285 ret = intel_pasid_setup_pass_through(iommu,
2286 dev, IOMMU_NO_PASID);
2287 else if (domain->use_first_level)
2288 ret = domain_setup_first_level(iommu, domain, dev,
2291 ret = intel_pasid_setup_second_level(iommu, domain,
2292 dev, IOMMU_NO_PASID);
2294 dev_err(dev, "Setup RID2PASID failed\n");
2295 device_block_translation(dev);
2300 ret = domain_context_mapping(domain, dev);
2302 dev_err(dev, "Domain context map failed\n");
2303 device_block_translation(dev);
2307 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2308 iommu_enable_pci_caps(info);
2314 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2315 * is relaxable (i.e. is allowed not to be enforced under some conditions)
2316 * @dev: device handle
2318 * We assume that PCI USB devices with RMRRs have them largely
2319 * for historical reasons and that the RMRR space is not actively used post
2320 * boot. This exclusion may change if vendors begin to abuse it.
2322 * The same exception is made for graphics devices, with the requirement that
2323 * any use of the RMRR regions will be torn down before assigning the device
2326 * Return: true if the RMRR is relaxable, false otherwise
2328 static bool device_rmrr_is_relaxable(struct device *dev)
2330 struct pci_dev *pdev;
2332 if (!dev_is_pci(dev))
2335 pdev = to_pci_dev(dev);
2336 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2343 * Return the required default domain type for a specific device.
2345 * @dev: the device in query
2346 * @startup: true if this is during early boot
2349 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2350 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2351 * - 0: both identity and dynamic domains work for this device
2353 static int device_def_domain_type(struct device *dev)
2355 if (dev_is_pci(dev)) {
2356 struct pci_dev *pdev = to_pci_dev(dev);
2358 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2359 return IOMMU_DOMAIN_IDENTITY;
2361 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2362 return IOMMU_DOMAIN_IDENTITY;
2368 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2371 * Start from a sane IOMMU hardware state.
2372 * If queued invalidation was already initialized by us
2373 * (for example, while enabling interrupt remapping), then
2374 * things are already rolling from a sane state.
2378 * Clear any previous faults.
2380 dmar_fault(-1, iommu);
2382 * Disable queued invalidation if supported and already enabled
2383 * before OS handover.
2385 dmar_disable_qi(iommu);
2388 if (dmar_enable_qi(iommu)) {
2390 * Queued Invalidate not enabled, use Register Based Invalidate
2392 iommu->flush.flush_context = __iommu_flush_context;
2393 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2394 pr_info("%s: Using Register based invalidation\n",
2397 iommu->flush.flush_context = qi_flush_context;
2398 iommu->flush.flush_iotlb = qi_flush_iotlb;
2399 pr_info("%s: Using Queued invalidation\n", iommu->name);
2403 static int copy_context_table(struct intel_iommu *iommu,
2404 struct root_entry *old_re,
2405 struct context_entry **tbl,
2408 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2409 struct context_entry *new_ce = NULL, ce;
2410 struct context_entry *old_ce = NULL;
2411 struct root_entry re;
2412 phys_addr_t old_ce_phys;
2414 tbl_idx = ext ? bus * 2 : bus;
2415 memcpy(&re, old_re, sizeof(re));
2417 for (devfn = 0; devfn < 256; devfn++) {
2418 /* First calculate the correct index */
2419 idx = (ext ? devfn * 2 : devfn) % 256;
2422 /* First save what we may have and clean up */
2424 tbl[tbl_idx] = new_ce;
2425 __iommu_flush_cache(iommu, new_ce,
2435 old_ce_phys = root_entry_lctp(&re);
2437 old_ce_phys = root_entry_uctp(&re);
2440 if (ext && devfn == 0) {
2441 /* No LCTP, try UCTP */
2450 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2455 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2462 /* Now copy the context entry */
2463 memcpy(&ce, old_ce + idx, sizeof(ce));
2465 if (!context_present(&ce))
2468 did = context_domain_id(&ce);
2469 if (did >= 0 && did < cap_ndoms(iommu->cap))
2470 set_bit(did, iommu->domain_ids);
2472 set_context_copied(iommu, bus, devfn);
2476 tbl[tbl_idx + pos] = new_ce;
2478 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2487 static int copy_translation_tables(struct intel_iommu *iommu)
2489 struct context_entry **ctxt_tbls;
2490 struct root_entry *old_rt;
2491 phys_addr_t old_rt_phys;
2492 int ctxt_table_entries;
2497 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2498 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2499 new_ext = !!sm_supported(iommu);
2502 * The RTT bit can only be changed when translation is disabled,
2503 * but disabling translation opens a window for data corruption.
2504 * So bail out and don't copy anything if we would have to change
2505 * the bit.
2510 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2511 if (!iommu->copied_tables)
2514 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2518 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2522 /* This is too big for the stack - allocate it from slab */
2523 ctxt_table_entries = ext ? 512 : 256;
2525 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2529 for (bus = 0; bus < 256; bus++) {
2530 ret = copy_context_table(iommu, &old_rt[bus],
2531 ctxt_tbls, bus, ext);
2533 pr_err("%s: Failed to copy context table for bus %d\n",
2539 spin_lock(&iommu->lock);
2541 /* Context tables are copied, now write them to the root_entry table */
2542 for (bus = 0; bus < 256; bus++) {
2543 int idx = ext ? bus * 2 : bus;
2546 if (ctxt_tbls[idx]) {
2547 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2548 iommu->root_entry[bus].lo = val;
2551 if (!ext || !ctxt_tbls[idx + 1])
2554 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2555 iommu->root_entry[bus].hi = val;
2558 spin_unlock(&iommu->lock);
2562 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
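/*
 * Illustrative sketch only (hypothetical helper): the index arithmetic used
 * by copy_context_table() above.  In extended (scalable) mode each bus owns
 * two context-table pages, so the table slot is bus * 2 and the per-devfn
 * entry index wraps at 256.
 */
static inline void __maybe_unused
example_ctxt_index(bool ext, int bus, int devfn, int *tbl_idx, int *idx)
{
	*tbl_idx = ext ? bus * 2 : bus;		/* which copied table slot */
	*idx = (ext ? devfn * 2 : devfn) % 256;	/* entry within that page  */
}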
2572 static int __init init_dmars(void)
2574 struct dmar_drhd_unit *drhd;
2575 struct intel_iommu *iommu;
2578 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2582 for_each_iommu(iommu, drhd) {
2583 if (drhd->ignored) {
2584 iommu_disable_translation(iommu);
2589 * Find the max PASID size of all IOMMUs in the system.
2590 * We need to ensure the system PASID table is no bigger
2591 * than the smallest supported size.
2593 if (pasid_supported(iommu)) {
2594 u32 temp = 2 << ecap_pss(iommu->ecap);
2596 intel_pasid_max_id = min_t(u32, temp,
2597 intel_pasid_max_id);
2600 intel_iommu_init_qi(iommu);
2602 ret = iommu_init_domains(iommu);
2606 init_translation_status(iommu);
2608 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2609 iommu_disable_translation(iommu);
2610 clear_translation_pre_enabled(iommu);
2611 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2617 * we could share the same root & context tables
2618 * among all IOMMUs. Need to split them later.
2620 ret = iommu_alloc_root_entry(iommu);
2624 if (translation_pre_enabled(iommu)) {
2625 pr_info("Translation already enabled - trying to copy translation structures\n");
2627 ret = copy_translation_tables(iommu);
2630 * We found the IOMMU with translation
2631 * enabled - but failed to copy over the
2632 * old root-entry table. Try to proceed
2633 * by disabling translation now and
2634 * allocating a clean root-entry table.
2635 * This might cause DMAR faults, but
2636 * probably the dump will still succeed.
2638 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2640 iommu_disable_translation(iommu);
2641 clear_translation_pre_enabled(iommu);
2643 pr_info("Copied translation tables from previous kernel for %s\n",
2648 if (!ecap_pass_through(iommu->ecap))
2649 hw_pass_through = 0;
2650 intel_svm_check(iommu);
2654 * Now that qi is enabled on all iommus, set the root entry and flush
2655 * caches. This is required on some Intel X58 chipsets, otherwise the
2656 * flush_context function will loop forever and the boot hangs.
2658 for_each_active_iommu(iommu, drhd) {
2659 iommu_flush_write_buffer(iommu);
2660 iommu_set_root_entry(iommu);
2663 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2668 iommu_identity_mapping |= IDENTMAP_GFX;
2670 check_tylersburg_isoch();
2672 ret = si_domain_init(hw_pass_through);
2679 * global invalidate context cache
2680 * global invalidate iotlb
2681 * enable translation
2683 for_each_iommu(iommu, drhd) {
2684 if (drhd->ignored) {
2686 * we always have to disable PMRs or DMA may fail on this device.
2690 iommu_disable_protect_mem_regions(iommu);
2694 iommu_flush_write_buffer(iommu);
2696 #ifdef CONFIG_INTEL_IOMMU_SVM
2697 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2699 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2700 * could cause a lock race, so temporarily drop the lock here.
2702 up_write(&dmar_global_lock);
2703 ret = intel_svm_enable_prq(iommu);
2704 down_write(&dmar_global_lock);
2709 ret = dmar_set_interrupt(iommu);
2717 for_each_active_iommu(iommu, drhd) {
2718 disable_dmar_iommu(iommu);
2719 free_dmar_iommu(iommu);
2722 domain_exit(si_domain);
2729 static void __init init_no_remapping_devices(void)
2731 struct dmar_drhd_unit *drhd;
2735 for_each_drhd_unit(drhd) {
2736 if (!drhd->include_all) {
2737 for_each_active_dev_scope(drhd->devices,
2738 drhd->devices_cnt, i, dev)
2740 /* ignore DMAR unit if no devices exist */
2741 if (i == drhd->devices_cnt)
2746 for_each_active_drhd_unit(drhd) {
2747 if (drhd->include_all)
2750 for_each_active_dev_scope(drhd->devices,
2751 drhd->devices_cnt, i, dev)
2752 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2754 if (i < drhd->devices_cnt)
2757 /* This IOMMU has *only* gfx devices. Either bypass it or
2758 mark it as dedicated to graphics (gfx_dedicated), as appropriate. */
2759 drhd->gfx_dedicated = 1;
2765 #ifdef CONFIG_SUSPEND
2766 static int init_iommu_hw(void)
2768 struct dmar_drhd_unit *drhd;
2769 struct intel_iommu *iommu = NULL;
2772 for_each_active_iommu(iommu, drhd) {
2774 ret = dmar_reenable_qi(iommu);
2780 for_each_iommu(iommu, drhd) {
2781 if (drhd->ignored) {
2783 * we always have to disable PMRs or DMA may fail on this device.
2787 iommu_disable_protect_mem_regions(iommu);
2791 iommu_flush_write_buffer(iommu);
2792 iommu_set_root_entry(iommu);
2793 iommu_enable_translation(iommu);
2794 iommu_disable_protect_mem_regions(iommu);
2800 static void iommu_flush_all(void)
2802 struct dmar_drhd_unit *drhd;
2803 struct intel_iommu *iommu;
2805 for_each_active_iommu(iommu, drhd) {
2806 iommu->flush.flush_context(iommu, 0, 0, 0,
2807 DMA_CCMD_GLOBAL_INVL);
2808 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2809 DMA_TLB_GLOBAL_FLUSH);
2813 static int iommu_suspend(void)
2815 struct dmar_drhd_unit *drhd;
2816 struct intel_iommu *iommu = NULL;
2821 for_each_active_iommu(iommu, drhd) {
2822 iommu_disable_translation(iommu);
2824 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2826 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2827 readl(iommu->reg + DMAR_FECTL_REG);
2828 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2829 readl(iommu->reg + DMAR_FEDATA_REG);
2830 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2831 readl(iommu->reg + DMAR_FEADDR_REG);
2832 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2833 readl(iommu->reg + DMAR_FEUADDR_REG);
2835 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2840 static void iommu_resume(void)
2842 struct dmar_drhd_unit *drhd;
2843 struct intel_iommu *iommu = NULL;
2846 if (init_iommu_hw()) {
2848 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2850 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2854 for_each_active_iommu(iommu, drhd) {
2856 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2858 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2859 iommu->reg + DMAR_FECTL_REG);
2860 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2861 iommu->reg + DMAR_FEDATA_REG);
2862 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2863 iommu->reg + DMAR_FEADDR_REG);
2864 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2865 iommu->reg + DMAR_FEUADDR_REG);
2867 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2871 static struct syscore_ops iommu_syscore_ops = {
2872 .resume = iommu_resume,
2873 .suspend = iommu_suspend,
2876 static void __init init_iommu_pm_ops(void)
2878 register_syscore_ops(&iommu_syscore_ops);
2882 static inline void init_iommu_pm_ops(void) {}
2883 #endif /* CONFIG_PM */
2885 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2887 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2888 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2889 rmrr->end_address <= rmrr->base_address ||
2890 arch_rmrr_sanity_check(rmrr))
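/*
 * Illustrative sketch only (hypothetical helper): the alignment and ordering
 * rules enforced by rmrr_sanity_check() above (arch-specific checks aside).
 * Both the base and the byte following the inclusive end must fall on page
 * boundaries, and the end must lie above the base; e.g.
 * [0x7f000000, 0x7f7fffff] passes while [0x7f000800, 0x7f7fffff] does not.
 */
static inline bool __maybe_unused example_rmrr_range_ok(u64 base, u64 end)
{
	return IS_ALIGNED(base, PAGE_SIZE) &&
	       IS_ALIGNED(end + 1, PAGE_SIZE) &&
	       end > base;
}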
2896 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2898 struct acpi_dmar_reserved_memory *rmrr;
2899 struct dmar_rmrr_unit *rmrru;
2901 rmrr = (struct acpi_dmar_reserved_memory *)header;
2902 if (rmrr_sanity_check(rmrr)) {
2904 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2905 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2906 rmrr->base_address, rmrr->end_address,
2907 dmi_get_system_info(DMI_BIOS_VENDOR),
2908 dmi_get_system_info(DMI_BIOS_VERSION),
2909 dmi_get_system_info(DMI_PRODUCT_VERSION));
2910 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2913 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2917 rmrru->hdr = header;
2919 rmrru->base_address = rmrr->base_address;
2920 rmrru->end_address = rmrr->end_address;
2922 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2923 ((void *)rmrr) + rmrr->header.length,
2924 &rmrru->devices_cnt);
2925 if (rmrru->devices_cnt && rmrru->devices == NULL)
2928 list_add(&rmrru->list, &dmar_rmrr_units);
2937 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2939 struct dmar_atsr_unit *atsru;
2940 struct acpi_dmar_atsr *tmp;
2942 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2944 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2945 if (atsr->segment != tmp->segment)
2947 if (atsr->header.length != tmp->header.length)
2949 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2956 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2958 struct acpi_dmar_atsr *atsr;
2959 struct dmar_atsr_unit *atsru;
2961 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2964 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2965 atsru = dmar_find_atsr(atsr);
2969 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2974 * If memory is allocated from slab by ACPI _DSM method, we need to
2975 * copy the memory content because the memory buffer will be freed on exit.
2978 atsru->hdr = (void *)(atsru + 1);
2979 memcpy(atsru->hdr, hdr, hdr->length);
2980 atsru->include_all = atsr->flags & 0x1;
2981 if (!atsru->include_all) {
2982 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2983 (void *)atsr + atsr->header.length,
2984 &atsru->devices_cnt);
2985 if (atsru->devices_cnt && atsru->devices == NULL) {
2991 list_add_rcu(&atsru->list, &dmar_atsr_units);
2996 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2998 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3002 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3004 struct acpi_dmar_atsr *atsr;
3005 struct dmar_atsr_unit *atsru;
3007 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3008 atsru = dmar_find_atsr(atsr);
3010 list_del_rcu(&atsru->list);
3012 intel_iommu_free_atsr(atsru);
3018 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3022 struct acpi_dmar_atsr *atsr;
3023 struct dmar_atsr_unit *atsru;
3025 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3026 atsru = dmar_find_atsr(atsr);
3030 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3031 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3039 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3041 struct dmar_satc_unit *satcu;
3042 struct acpi_dmar_satc *tmp;
3044 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3046 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3047 if (satc->segment != tmp->segment)
3049 if (satc->header.length != tmp->header.length)
3051 if (memcmp(satc, tmp, satc->header.length) == 0)
3058 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3060 struct acpi_dmar_satc *satc;
3061 struct dmar_satc_unit *satcu;
3063 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3066 satc = container_of(hdr, struct acpi_dmar_satc, header);
3067 satcu = dmar_find_satc(satc);
3071 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3075 satcu->hdr = (void *)(satcu + 1);
3076 memcpy(satcu->hdr, hdr, hdr->length);
3077 satcu->atc_required = satc->flags & 0x1;
3078 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3079 (void *)satc + satc->header.length,
3080 &satcu->devices_cnt);
3081 if (satcu->devices_cnt && !satcu->devices) {
3085 list_add_rcu(&satcu->list, &dmar_satc_units);
3090 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3093 struct intel_iommu *iommu = dmaru->iommu;
3095 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3099 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3100 pr_warn("%s: Doesn't support hardware pass through.\n",
3105 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3106 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3107 pr_warn("%s: Doesn't support large page.\n",
3113 * Disable translation if already enabled prior to OS handover.
3115 if (iommu->gcmd & DMA_GCMD_TE)
3116 iommu_disable_translation(iommu);
3118 ret = iommu_init_domains(iommu);
3120 ret = iommu_alloc_root_entry(iommu);
3124 intel_svm_check(iommu);
3126 if (dmaru->ignored) {
3128 * we always have to disable PMRs or DMA may fail on this device
3131 iommu_disable_protect_mem_regions(iommu);
3135 intel_iommu_init_qi(iommu);
3136 iommu_flush_write_buffer(iommu);
3138 #ifdef CONFIG_INTEL_IOMMU_SVM
3139 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3140 ret = intel_svm_enable_prq(iommu);
3145 ret = dmar_set_interrupt(iommu);
3149 iommu_set_root_entry(iommu);
3150 iommu_enable_translation(iommu);
3152 iommu_disable_protect_mem_regions(iommu);
3156 disable_dmar_iommu(iommu);
3158 free_dmar_iommu(iommu);
3162 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3165 struct intel_iommu *iommu = dmaru->iommu;
3167 if (!intel_iommu_enabled)
3173 ret = intel_iommu_add(dmaru);
3175 disable_dmar_iommu(iommu);
3176 free_dmar_iommu(iommu);
3182 static void intel_iommu_free_dmars(void)
3184 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3185 struct dmar_atsr_unit *atsru, *atsr_n;
3186 struct dmar_satc_unit *satcu, *satc_n;
3188 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3189 list_del(&rmrru->list);
3190 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3194 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3195 list_del(&atsru->list);
3196 intel_iommu_free_atsr(atsru);
3198 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3199 list_del(&satcu->list);
3200 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3205 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3207 struct dmar_satc_unit *satcu;
3208 struct acpi_dmar_satc *satc;
3212 dev = pci_physfn(dev);
3215 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3216 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3217 if (satc->segment != pci_domain_nr(dev->bus))
3219 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3220 if (to_pci_dev(tmp) == dev)
3229 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3232 struct pci_bus *bus;
3233 struct pci_dev *bridge = NULL;
3235 struct acpi_dmar_atsr *atsr;
3236 struct dmar_atsr_unit *atsru;
3237 struct dmar_satc_unit *satcu;
3239 dev = pci_physfn(dev);
3240 satcu = dmar_find_matched_satc_unit(dev);
3243 * This device supports ATS, as it is listed in a SATC table.
3244 * When the IOMMU is in legacy mode, enabling ATS is done
3245 * automatically by HW for devices that require ATS, hence
3246 * the OS should not enable ATS for this device, to avoid
3247 * duplicated TLB invalidations.
3249 return !(satcu->atc_required && !sm_supported(iommu));
3251 for (bus = dev->bus; bus; bus = bus->parent) {
3253 /* If it's an integrated device, allow ATS */
3256 /* Connected via non-PCIe: no ATS */
3257 if (!pci_is_pcie(bridge) ||
3258 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3260 /* If we found the root port, look it up in the ATSR */
3261 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3266 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3267 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3268 if (atsr->segment != pci_domain_nr(dev->bus))
3271 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3272 if (tmp == &bridge->dev)
3275 if (atsru->include_all)
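/*
 * Illustrative sketch only (hypothetical helper): the SATC decision taken at
 * the top of dmar_ats_supported() above.  The OS may enable ATS unless the
 * SATC table marks ATS as required *and* the IOMMU runs in legacy
 * (non-scalable) mode, in which case hardware enables it on its own.
 */
static inline bool __maybe_unused
example_satc_ats_allowed(bool atc_required, bool scalable_mode)
{
	return !(atc_required && !scalable_mode);
}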
3285 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3288 struct dmar_rmrr_unit *rmrru;
3289 struct dmar_atsr_unit *atsru;
3290 struct dmar_satc_unit *satcu;
3291 struct acpi_dmar_atsr *atsr;
3292 struct acpi_dmar_reserved_memory *rmrr;
3293 struct acpi_dmar_satc *satc;
3295 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3298 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3299 rmrr = container_of(rmrru->hdr,
3300 struct acpi_dmar_reserved_memory, header);
3301 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3302 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3303 ((void *)rmrr) + rmrr->header.length,
3304 rmrr->segment, rmrru->devices,
3305 rmrru->devices_cnt);
3308 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3309 dmar_remove_dev_scope(info, rmrr->segment,
3310 rmrru->devices, rmrru->devices_cnt);
3314 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3315 if (atsru->include_all)
3318 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3319 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3320 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3321 (void *)atsr + atsr->header.length,
3322 atsr->segment, atsru->devices,
3323 atsru->devices_cnt);
3328 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3329 if (dmar_remove_dev_scope(info, atsr->segment,
3330 atsru->devices, atsru->devices_cnt))
3334 list_for_each_entry(satcu, &dmar_satc_units, list) {
3335 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3336 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3337 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3338 (void *)satc + satc->header.length,
3339 satc->segment, satcu->devices,
3340 satcu->devices_cnt);
3345 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3346 if (dmar_remove_dev_scope(info, satc->segment,
3347 satcu->devices, satcu->devices_cnt))
3355 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3356 unsigned long val, void *v)
3358 struct memory_notify *mhp = v;
3359 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3360 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3364 case MEM_GOING_ONLINE:
3365 if (iommu_domain_identity_map(si_domain,
3366 start_vpfn, last_vpfn)) {
3367 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3368 start_vpfn, last_vpfn);
3374 case MEM_CANCEL_ONLINE:
3376 struct dmar_drhd_unit *drhd;
3377 struct intel_iommu *iommu;
3378 LIST_HEAD(freelist);
3380 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3383 for_each_active_iommu(iommu, drhd)
3384 iommu_flush_iotlb_psi(iommu, si_domain,
3385 start_vpfn, mhp->nr_pages,
3386 list_empty(&freelist), 0);
3388 put_pages_list(&freelist);
3396 static struct notifier_block intel_iommu_memory_nb = {
3397 .notifier_call = intel_iommu_memory_notifier,
3401 static void intel_disable_iommus(void)
3403 struct intel_iommu *iommu = NULL;
3404 struct dmar_drhd_unit *drhd;
3406 for_each_iommu(iommu, drhd)
3407 iommu_disable_translation(iommu);
3410 void intel_iommu_shutdown(void)
3412 struct dmar_drhd_unit *drhd;
3413 struct intel_iommu *iommu = NULL;
3415 if (no_iommu || dmar_disabled)
3418 down_write(&dmar_global_lock);
3420 /* Disable PMRs explicitly here. */
3421 for_each_iommu(iommu, drhd)
3422 iommu_disable_protect_mem_regions(iommu);
3424 /* Make sure the IOMMUs are switched off */
3425 intel_disable_iommus();
3427 up_write(&dmar_global_lock);
3430 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3432 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3434 return container_of(iommu_dev, struct intel_iommu, iommu);
3437 static ssize_t version_show(struct device *dev,
3438 struct device_attribute *attr, char *buf)
3440 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3441 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3442 return sysfs_emit(buf, "%d:%d\n",
3443 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3445 static DEVICE_ATTR_RO(version);
3447 static ssize_t address_show(struct device *dev,
3448 struct device_attribute *attr, char *buf)
3450 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3451 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3453 static DEVICE_ATTR_RO(address);
3455 static ssize_t cap_show(struct device *dev,
3456 struct device_attribute *attr, char *buf)
3458 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3459 return sysfs_emit(buf, "%llx\n", iommu->cap);
3461 static DEVICE_ATTR_RO(cap);
3463 static ssize_t ecap_show(struct device *dev,
3464 struct device_attribute *attr, char *buf)
3466 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3467 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3469 static DEVICE_ATTR_RO(ecap);
3471 static ssize_t domains_supported_show(struct device *dev,
3472 struct device_attribute *attr, char *buf)
3474 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3475 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3477 static DEVICE_ATTR_RO(domains_supported);
3479 static ssize_t domains_used_show(struct device *dev,
3480 struct device_attribute *attr, char *buf)
3482 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3483 return sysfs_emit(buf, "%d\n",
3484 bitmap_weight(iommu->domain_ids,
3485 cap_ndoms(iommu->cap)));
3487 static DEVICE_ATTR_RO(domains_used);
3489 static struct attribute *intel_iommu_attrs[] = {
3490 &dev_attr_version.attr,
3491 &dev_attr_address.attr,
3493 &dev_attr_ecap.attr,
3494 &dev_attr_domains_supported.attr,
3495 &dev_attr_domains_used.attr,
3499 static struct attribute_group intel_iommu_group = {
3500 .name = "intel-iommu",
3501 .attrs = intel_iommu_attrs,
3504 const struct attribute_group *intel_iommu_groups[] = {
3509 static bool has_external_pci(void)
3511 struct pci_dev *pdev = NULL;
3513 for_each_pci_dev(pdev)
3514 if (pdev->external_facing) {
3522 static int __init platform_optin_force_iommu(void)
3524 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3527 if (no_iommu || dmar_disabled)
3528 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3531 * If Intel-IOMMU is disabled by default, we will apply identity
3532 * map for all devices except those marked as being untrusted.
3535 iommu_set_default_passthrough(false);
3543 static int __init probe_acpi_namespace_devices(void)
3545 struct dmar_drhd_unit *drhd;
3546 /* To avoid a -Wunused-but-set-variable warning. */
3547 struct intel_iommu *iommu __maybe_unused;
3551 for_each_active_iommu(iommu, drhd) {
3552 for_each_active_dev_scope(drhd->devices,
3553 drhd->devices_cnt, i, dev) {
3554 struct acpi_device_physical_node *pn;
3555 struct acpi_device *adev;
3557 if (dev->bus != &acpi_bus_type)
3560 adev = to_acpi_device(dev);
3561 mutex_lock(&adev->physical_node_lock);
3562 list_for_each_entry(pn,
3563 &adev->physical_node_list, node) {
3564 ret = iommu_probe_device(pn->dev);
3568 mutex_unlock(&adev->physical_node_lock);
3578 static __init int tboot_force_iommu(void)
3580 if (!tboot_enabled())
3583 if (no_iommu || dmar_disabled)
3584 pr_warn("Forcing Intel-IOMMU to enabled\n");
3592 int __init intel_iommu_init(void)
3595 struct dmar_drhd_unit *drhd;
3596 struct intel_iommu *iommu;
3599 * Intel IOMMU is required for a TXT/tboot launch or platform
3600 * opt in, so enforce that.
3602 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3603 platform_optin_force_iommu();
3605 down_write(&dmar_global_lock);
3606 if (dmar_table_init()) {
3608 panic("tboot: Failed to initialize DMAR table\n");
3612 if (dmar_dev_scope_init() < 0) {
3614 panic("tboot: Failed to initialize DMAR device scope\n");
3618 up_write(&dmar_global_lock);
3621 * The bus notifier takes the dmar_global_lock, so lockdep will
3622 * complain later when we register it under the lock.
3624 dmar_register_bus_notifier();
3626 down_write(&dmar_global_lock);
3629 intel_iommu_debugfs_init();
3631 if (no_iommu || dmar_disabled) {
3633 * We exit the function here to ensure the IOMMU's remapping and
3634 * mempool aren't set up, which means that the IOMMU's PMRs
3635 * won't be disabled via the call to init_dmars(). So disable
3636 * them explicitly here. The PMRs were set up by tboot prior to
3637 * calling SENTER, but the kernel is expected to reset/tear them down.
3640 if (intel_iommu_tboot_noforce) {
3641 for_each_iommu(iommu, drhd)
3642 iommu_disable_protect_mem_regions(iommu);
3646 * Make sure the IOMMUs are switched off, even when we
3647 * boot into a kexec kernel and the previous kernel left
3650 intel_disable_iommus();
3654 if (list_empty(&dmar_rmrr_units))
3655 pr_info("No RMRR found\n");
3657 if (list_empty(&dmar_atsr_units))
3658 pr_info("No ATSR found\n");
3660 if (list_empty(&dmar_satc_units))
3661 pr_info("No SATC found\n");
3663 init_no_remapping_devices();
3668 panic("tboot: Failed to initialize DMARs\n");
3669 pr_err("Initialization failed\n");
3672 up_write(&dmar_global_lock);
3674 init_iommu_pm_ops();
3676 down_read(&dmar_global_lock);
3677 for_each_active_iommu(iommu, drhd) {
3679 * The flush queue implementation does not perform
3680 * page-selective invalidations that are required for efficient
3681 * TLB flushes in virtual environments. The benefit of batching
3682 * is likely to be much lower than the overhead of synchronizing
3683 * the virtual and physical IOMMU page-tables.
3685 if (cap_caching_mode(iommu->cap) &&
3686 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3687 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3688 iommu_set_dma_strict();
3690 iommu_device_sysfs_add(&iommu->iommu, NULL,
3693 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3695 iommu_pmu_register(iommu);
3697 up_read(&dmar_global_lock);
3699 if (si_domain && !hw_pass_through)
3700 register_memory_notifier(&intel_iommu_memory_nb);
3702 down_read(&dmar_global_lock);
3703 if (probe_acpi_namespace_devices())
3704 pr_warn("ACPI name space devices didn't probe correctly\n");
3706 /* Finally, we enable the DMA remapping hardware. */
3707 for_each_iommu(iommu, drhd) {
3708 if (!drhd->ignored && !translation_pre_enabled(iommu))
3709 iommu_enable_translation(iommu);
3711 iommu_disable_protect_mem_regions(iommu);
3713 up_read(&dmar_global_lock);
3715 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3717 intel_iommu_enabled = 1;
3722 intel_iommu_free_dmars();
3723 up_write(&dmar_global_lock);
3727 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3729 struct device_domain_info *info = opaque;
3731 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3736 * NB - intel-iommu lacks any sort of reference counting for the users of
3737 * dependent devices. If multiple endpoints have intersecting dependent
3738 * devices, unbinding the driver from any one of them will possibly leave
3739 * the others unable to operate.
3741 static void domain_context_clear(struct device_domain_info *info)
3743 if (!dev_is_pci(info->dev))
3744 domain_context_clear_one(info, info->bus, info->devfn);
3746 pci_for_each_dma_alias(to_pci_dev(info->dev),
3747 &domain_context_clear_one_cb, info);
3750 static void dmar_remove_one_dev_info(struct device *dev)
3752 struct device_domain_info *info = dev_iommu_priv_get(dev);
3753 struct dmar_domain *domain = info->domain;
3754 struct intel_iommu *iommu = info->iommu;
3755 unsigned long flags;
3757 if (!dev_is_real_dma_subdevice(info->dev)) {
3758 if (dev_is_pci(info->dev) && sm_supported(iommu))
3759 intel_pasid_tear_down_entry(iommu, info->dev,
3760 IOMMU_NO_PASID, false);
3762 iommu_disable_pci_caps(info);
3763 domain_context_clear(info);
3766 spin_lock_irqsave(&domain->lock, flags);
3767 list_del(&info->link);
3768 spin_unlock_irqrestore(&domain->lock, flags);
3770 domain_detach_iommu(domain, iommu);
3771 info->domain = NULL;
3775 * Clear the page table pointer in context or pasid table entries so that
3776 * all DMA requests without PASID from the device are blocked. If the page
3777 * table has been set, clean up the data structures.
3779 void device_block_translation(struct device *dev)
3781 struct device_domain_info *info = dev_iommu_priv_get(dev);
3782 struct intel_iommu *iommu = info->iommu;
3783 unsigned long flags;
3785 iommu_disable_pci_caps(info);
3786 if (!dev_is_real_dma_subdevice(dev)) {
3787 if (sm_supported(iommu))
3788 intel_pasid_tear_down_entry(iommu, dev,
3789 IOMMU_NO_PASID, false);
3791 domain_context_clear(info);
3797 spin_lock_irqsave(&info->domain->lock, flags);
3798 list_del(&info->link);
3799 spin_unlock_irqrestore(&info->domain->lock, flags);
3801 domain_detach_iommu(info->domain, iommu);
3802 info->domain = NULL;
3805 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3809 /* calculate AGAW */
3810 domain->gaw = guest_width;
3811 adjust_width = guestwidth_to_adjustwidth(guest_width);
3812 domain->agaw = width_to_agaw(adjust_width);
3814 domain->iommu_coherency = false;
3815 domain->iommu_superpage = 0;
3816 domain->max_addr = 0;
3818 /* always allocate the top pgd */
3819 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3822 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3826 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3829 device_block_translation(dev);
3833 static struct iommu_domain blocking_domain = {
3834 .type = IOMMU_DOMAIN_BLOCKED,
3835 .ops = &(const struct iommu_domain_ops) {
3836 .attach_dev = blocking_domain_attach_dev,
3840 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3842 struct dmar_domain *dmar_domain;
3843 struct iommu_domain *domain;
3846 case IOMMU_DOMAIN_DMA:
3847 case IOMMU_DOMAIN_UNMANAGED:
3848 dmar_domain = alloc_domain(type);
3850 pr_err("Can't allocate dmar_domain\n");
3853 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3854 pr_err("Domain initialization failed\n");
3855 domain_exit(dmar_domain);
3859 domain = &dmar_domain->domain;
3860 domain->geometry.aperture_start = 0;
3861 domain->geometry.aperture_end =
3862 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3863 domain->geometry.force_aperture = true;
3866 case IOMMU_DOMAIN_IDENTITY:
3867 return &si_domain->domain;
3868 case IOMMU_DOMAIN_SVA:
3869 return intel_svm_domain_alloc();
3877 static struct iommu_domain *
3878 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3879 struct iommu_domain *parent,
3880 const struct iommu_user_data *user_data)
3882 struct device_domain_info *info = dev_iommu_priv_get(dev);
3883 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3884 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3885 struct intel_iommu *iommu = info->iommu;
3886 struct iommu_domain *domain;
3888 /* Must be NESTING domain */
3890 if (!nested_supported(iommu) || flags)
3891 return ERR_PTR(-EOPNOTSUPP);
3892 return intel_nested_domain_alloc(parent, user_data);
3896 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3897 return ERR_PTR(-EOPNOTSUPP);
3898 if (nested_parent && !nested_supported(iommu))
3899 return ERR_PTR(-EOPNOTSUPP);
3900 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3901 return ERR_PTR(-EOPNOTSUPP);
3904 * The domain_alloc_user op needs to fully initialize a domain before
3905 * returning, so use iommu_domain_alloc() here for simplicity.
3907 domain = iommu_domain_alloc(dev->bus);
3909 return ERR_PTR(-ENOMEM);
3912 to_dmar_domain(domain)->nested_parent = true;
3914 if (dirty_tracking) {
3915 if (to_dmar_domain(domain)->use_first_level) {
3916 iommu_domain_free(domain);
3917 return ERR_PTR(-EOPNOTSUPP);
3919 domain->dirty_ops = &intel_dirty_ops;
3925 static void intel_iommu_domain_free(struct iommu_domain *domain)
3927 if (domain != &si_domain->domain)
3928 domain_exit(to_dmar_domain(domain));
3931 int prepare_domain_attach_device(struct iommu_domain *domain,
3934 struct device_domain_info *info = dev_iommu_priv_get(dev);
3935 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3936 struct intel_iommu *iommu = info->iommu;
3939 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3942 if (domain->dirty_ops && !ssads_supported(iommu))
3945 /* check if this iommu agaw is sufficient for max mapped address */
3946 addr_width = agaw_to_width(iommu->agaw);
3947 if (addr_width > cap_mgaw(iommu->cap))
3948 addr_width = cap_mgaw(iommu->cap);
3950 if (dmar_domain->max_addr > (1LL << addr_width))
3952 dmar_domain->gaw = addr_width;
3955 * Knock out extra levels of page tables if necessary
3957 while (iommu->agaw < dmar_domain->agaw) {
3958 struct dma_pte *pte;
3960 pte = dmar_domain->pgd;
3961 if (dma_pte_present(pte)) {
3962 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3963 free_pgtable_page(pte);
3965 dmar_domain->agaw--;
3971 static int intel_iommu_attach_device(struct iommu_domain *domain,
3974 struct device_domain_info *info = dev_iommu_priv_get(dev);
3978 device_block_translation(dev);
3980 ret = prepare_domain_attach_device(domain, dev);
3984 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3987 static int intel_iommu_map(struct iommu_domain *domain,
3988 unsigned long iova, phys_addr_t hpa,
3989 size_t size, int iommu_prot, gfp_t gfp)
3991 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3995 if (iommu_prot & IOMMU_READ)
3996 prot |= DMA_PTE_READ;
3997 if (iommu_prot & IOMMU_WRITE)
3998 prot |= DMA_PTE_WRITE;
3999 if (dmar_domain->set_pte_snp)
4000 prot |= DMA_PTE_SNP;
4002 max_addr = iova + size;
4003 if (dmar_domain->max_addr < max_addr) {
4006 /* check if minimum agaw is sufficient for mapped address */
4007 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4008 if (end < max_addr) {
4009 pr_err("%s: iommu width (%d) is not "
4010 "sufficient for the mapped address (%llx)\n",
4011 __func__, dmar_domain->gaw, max_addr);
4014 dmar_domain->max_addr = max_addr;
4016 /* Round up size to next multiple of PAGE_SIZE, if it and
4017 the low bits of hpa would take us onto the next page */
4018 size = aligned_nrpages(hpa, size);
4019 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4020 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
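/*
 * Illustrative sketch only (hypothetical helper): the page-count round-up
 * performed by aligned_nrpages() above.  A buffer whose low address bits and
 * size together cross a page boundary needs an extra page frame; e.g.
 * hpa = 0x1800 with size = 0x1000 spans two 4 KiB pages.
 */
static inline unsigned long __maybe_unused
example_nrpages(phys_addr_t hpa, size_t size)
{
	return PAGE_ALIGN((hpa & ~PAGE_MASK) + size) >> VTD_PAGE_SHIFT;
}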
4023 static int intel_iommu_map_pages(struct iommu_domain *domain,
4024 unsigned long iova, phys_addr_t paddr,
4025 size_t pgsize, size_t pgcount,
4026 int prot, gfp_t gfp, size_t *mapped)
4028 unsigned long pgshift = __ffs(pgsize);
4029 size_t size = pgcount << pgshift;
4032 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4035 if (!IS_ALIGNED(iova | paddr, pgsize))
4038 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4045 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4046 unsigned long iova, size_t size,
4047 struct iommu_iotlb_gather *gather)
4049 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4050 unsigned long start_pfn, last_pfn;
4053 /* Cope with horrid API which requires us to unmap more than the
4054 size argument if it happens to be a large-page mapping. */
4055 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4056 &level, GFP_ATOMIC)))
4059 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4060 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4062 start_pfn = iova >> VTD_PAGE_SHIFT;
4063 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4065 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4067 if (dmar_domain->max_addr == iova + size)
4068 dmar_domain->max_addr = iova;
4071 * We do not use page-selective IOTLB invalidation in the flush queue,
4072 * so there is no need to track the pages or sync the IOTLB here.
4074 if (!iommu_iotlb_gather_queued(gather))
4075 iommu_iotlb_gather_add_page(domain, gather, iova, size);
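/*
 * Illustrative sketch only (hypothetical helper): the size round-up that
 * intel_iommu_unmap() performs above when the IOVA is covered by a
 * large-page PTE.  Assuming the usual 9-bit stride per level and 4 KiB base
 * pages, level 1 maps 4 KiB, level 2 maps 2 MiB and level 3 maps 1 GiB.
 */
static inline size_t __maybe_unused example_unmap_granule(int level)
{
	return (size_t)VTD_PAGE_SIZE << ((level - 1) * 9);
}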
4080 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4082 size_t pgsize, size_t pgcount,
4083 struct iommu_iotlb_gather *gather)
4085 unsigned long pgshift = __ffs(pgsize);
4086 size_t size = pgcount << pgshift;
4088 return intel_iommu_unmap(domain, iova, size, gather);
4091 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4092 struct iommu_iotlb_gather *gather)
4094 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4095 unsigned long iova_pfn = IOVA_PFN(gather->start);
4096 size_t size = gather->end - gather->start;
4097 struct iommu_domain_info *info;
4098 unsigned long start_pfn;
4099 unsigned long nrpages;
4102 nrpages = aligned_nrpages(gather->start, size);
4103 start_pfn = mm_to_dma_pfn_start(iova_pfn);
4105 xa_for_each(&dmar_domain->iommu_array, i, info)
4106 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4108 list_empty(&gather->freelist), 0);
4110 put_pages_list(&gather->freelist);
4113 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4116 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4117 struct dma_pte *pte;
4121 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4123 if (pte && dma_pte_present(pte))
4124 phys = dma_pte_addr(pte) +
4125 (iova & (BIT_MASK(level_to_offset_bits(level) +
4126 VTD_PAGE_SHIFT) - 1));
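/*
 * Illustrative sketch only (hypothetical helper): the offset mask used by
 * intel_iommu_iova_to_phys() above.  For a PTE found at level 2 (a 2 MiB
 * superpage, assuming the usual 9-bit stride), the low 21 bits of the IOVA
 * are kept as the offset added to the PTE's physical address.
 */
static inline u64 __maybe_unused example_page_offset_mask(int level)
{
	return BIT_MASK((level - 1) * 9 + VTD_PAGE_SHIFT) - 1;
}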
4131 static bool domain_support_force_snooping(struct dmar_domain *domain)
4133 struct device_domain_info *info;
4134 bool support = true;
4136 assert_spin_locked(&domain->lock);
4137 list_for_each_entry(info, &domain->devices, link) {
4138 if (!ecap_sc_support(info->iommu->ecap)) {
4147 static void domain_set_force_snooping(struct dmar_domain *domain)
4149 struct device_domain_info *info;
4151 assert_spin_locked(&domain->lock);
4153 * The second-level page table supports per-PTE snoop control. The
4154 * iommu_map() interface will handle this by setting the SNP bit.
4156 if (!domain->use_first_level) {
4157 domain->set_pte_snp = true;
4161 list_for_each_entry(info, &domain->devices, link)
4162 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4166 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4168 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4169 unsigned long flags;
4171 if (dmar_domain->force_snooping)
4174 spin_lock_irqsave(&dmar_domain->lock, flags);
4175 if (!domain_support_force_snooping(dmar_domain) ||
4176 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4177 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4181 domain_set_force_snooping(dmar_domain);
4182 dmar_domain->force_snooping = true;
4183 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4188 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4190 struct device_domain_info *info = dev_iommu_priv_get(dev);
4193 case IOMMU_CAP_CACHE_COHERENCY:
4194 case IOMMU_CAP_DEFERRED_FLUSH:
4196 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4197 return dmar_platform_optin();
4198 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4199 return ecap_sc_support(info->iommu->ecap);
4200 case IOMMU_CAP_DIRTY_TRACKING:
4201 return ssads_supported(info->iommu);
4207 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4209 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4210 struct device_domain_info *info;
4211 struct intel_iommu *iommu;
4215 iommu = device_lookup_iommu(dev, &bus, &devfn);
4216 if (!iommu || !iommu->iommu.ops)
4217 return ERR_PTR(-ENODEV);
4219 info = kzalloc(sizeof(*info), GFP_KERNEL);
4221 return ERR_PTR(-ENOMEM);
4223 if (dev_is_real_dma_subdevice(dev)) {
4224 info->bus = pdev->bus->number;
4225 info->devfn = pdev->devfn;
4226 info->segment = pci_domain_nr(pdev->bus);
4229 info->devfn = devfn;
4230 info->segment = iommu->segment;
4234 info->iommu = iommu;
4235 if (dev_is_pci(dev)) {
4236 if (ecap_dev_iotlb_support(iommu->ecap) &&
4237 pci_ats_supported(pdev) &&
4238 dmar_ats_supported(pdev, iommu)) {
4239 info->ats_supported = 1;
4240 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4243 * For IOMMUs that support device IOTLB throttling
4244 * (DIT), we assign the PFSID in the invalidation desc
4245 * of a VF so that the IOMMU HW can gauge queue depth
4246 * at the PF level. If DIT is not set, PFSID is
4247 * treated as reserved and should be set to 0.
4249 if (ecap_dit(iommu->ecap))
4250 info->pfsid = pci_dev_id(pci_physfn(pdev));
4251 info->ats_qdep = pci_ats_queue_depth(pdev);
4253 if (sm_supported(iommu)) {
4254 if (pasid_supported(iommu)) {
4255 int features = pci_pasid_features(pdev);
4258 info->pasid_supported = features | 1;
4261 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4262 pci_pri_supported(pdev))
4263 info->pri_supported = 1;
4267 dev_iommu_priv_set(dev, info);
4269 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4270 ret = intel_pasid_alloc_table(dev);
4272 dev_err(dev, "PASID table allocation failed\n");
4274 return ERR_PTR(ret);
4278 intel_iommu_debugfs_create_dev(info);
4280 return &iommu->iommu;
4283 static void intel_iommu_release_device(struct device *dev)
4285 struct device_domain_info *info = dev_iommu_priv_get(dev);
4287 dmar_remove_one_dev_info(dev);
4288 intel_pasid_free_table(dev);
4289 intel_iommu_debugfs_remove_dev(info);
4291 set_dma_ops(dev, NULL);
4294 static void intel_iommu_probe_finalize(struct device *dev)
4296 set_dma_ops(dev, NULL);
4297 iommu_setup_dma_ops(dev, 0, U64_MAX);
4300 static void intel_iommu_get_resv_regions(struct device *device,
4301 struct list_head *head)
4303 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4304 struct iommu_resv_region *reg;
4305 struct dmar_rmrr_unit *rmrr;
4306 struct device *i_dev;
4310 for_each_rmrr_units(rmrr) {
4311 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4313 struct iommu_resv_region *resv;
4314 enum iommu_resv_type type;
4317 if (i_dev != device &&
4318 !is_downstream_to_pci_bridge(device, i_dev))
4321 length = rmrr->end_address - rmrr->base_address + 1;
4323 type = device_rmrr_is_relaxable(device) ?
4324 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4326 resv = iommu_alloc_resv_region(rmrr->base_address,
4332 list_add_tail(&resv->list, head);
4337 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4338 if (dev_is_pci(device)) {
4339 struct pci_dev *pdev = to_pci_dev(device);
4341 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4342 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4343 IOMMU_RESV_DIRECT_RELAXABLE,
4346 list_add_tail(&reg->list, head);
4349 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4351 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4352 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4353 0, IOMMU_RESV_MSI, GFP_KERNEL);
4356 list_add_tail(&reg->list, head);
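/*
 * Illustrative sketch only (hypothetical function): how a caller outside the
 * driver would consume the reserved regions built above, via the generic
 * IOMMU API.
 */
static void __maybe_unused example_dump_resv_regions(struct device *dev)
{
	struct iommu_resv_region *region;
	LIST_HEAD(resv);

	iommu_get_resv_regions(dev, &resv);
	list_for_each_entry(region, &resv, list)
		dev_info(dev, "resv [%pa + %zx] type %d\n",
			 &region->start, region->length, region->type);
	iommu_put_resv_regions(dev, &resv);
}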
4359 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4361 if (dev_is_pci(dev))
4362 return pci_device_group(dev);
4363 return generic_device_group(dev);
4366 static int intel_iommu_enable_sva(struct device *dev)
4368 struct device_domain_info *info = dev_iommu_priv_get(dev);
4369 struct intel_iommu *iommu;
4371 if (!info || dmar_disabled)
4374 iommu = info->iommu;
4378 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4381 if (!info->pasid_enabled || !info->ats_enabled)
4385 * Devices with device-specific I/O fault handling should not
4386 * support PCI/PRI. The IOMMU side has no means to check the
4387 * capability of device-specific IOPF. Therefore, the IOMMU can only
4388 * assume that if the device driver enables SVA on a non-PRI
4389 * device, the driver will handle IOPF in its own way.
4391 if (!info->pri_supported)
4394 /* Devices supporting PRI should have it enabled. */
4395 if (!info->pri_enabled)
4401 static int intel_iommu_enable_iopf(struct device *dev)
4403 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4404 struct device_domain_info *info = dev_iommu_priv_get(dev);
4405 struct intel_iommu *iommu;
4408 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4411 if (info->pri_enabled)
4414 iommu = info->iommu;
4418 /* PASID is required in PRG Response Message. */
4419 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4422 ret = pci_reset_pri(pdev);
4426 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4430 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4432 goto iopf_remove_device;
4434 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4436 goto iopf_unregister_handler;
4437 info->pri_enabled = 1;
4441 iopf_unregister_handler:
4442 iommu_unregister_device_fault_handler(dev);
4444 iopf_queue_remove_device(iommu->iopf_queue, dev);
4449 static int intel_iommu_disable_iopf(struct device *dev)
4451 struct device_domain_info *info = dev_iommu_priv_get(dev);
4452 struct intel_iommu *iommu = info->iommu;
4454 if (!info->pri_enabled)
4458 * PCIe spec states that by clearing PRI enable bit, the Page
4459 * Request Interface will not issue new page requests, but has
4460 * outstanding page requests that have been transmitted or are
4461 * queued for transmission. This is supposed to be called after
4462 * the device driver has stopped DMA, all PASIDs have been
4463 * unbound and the outstanding PRQs have been drained.
4465 pci_disable_pri(to_pci_dev(dev));
4466 info->pri_enabled = 0;
4469 * With PRI disabled and outstanding PRQs drained, unregistering the
4470 * fault handler and removing the device from the iopf queue should never fail.
4473 WARN_ON(iommu_unregister_device_fault_handler(dev));
4474 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4480 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4483 case IOMMU_DEV_FEAT_IOPF:
4484 return intel_iommu_enable_iopf(dev);
4486 case IOMMU_DEV_FEAT_SVA:
4487 return intel_iommu_enable_sva(dev);
4495 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4498 case IOMMU_DEV_FEAT_IOPF:
4499 return intel_iommu_disable_iopf(dev);
4501 case IOMMU_DEV_FEAT_SVA:
4509 static bool intel_iommu_is_attach_deferred(struct device *dev)
4511 struct device_domain_info *info = dev_iommu_priv_get(dev);
4513 return translation_pre_enabled(info->iommu) && !info->domain;
4517 * Check that the device does not live on an external facing PCI port that is
4518 * marked as untrusted. Such devices should not be able to apply quirks and
4519 * thus not be able to bypass the IOMMU restrictions.
4521 static bool risky_device(struct pci_dev *pdev)
4523 if (pdev->untrusted) {
4525 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4526 pdev->vendor, pdev->device);
4527 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4533 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4534 unsigned long iova, size_t size)
4536 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4537 unsigned long pages = aligned_nrpages(iova, size);
4538 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4539 struct iommu_domain_info *info;
4542 xa_for_each(&dmar_domain->iommu_array, i, info)
4543 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4547 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4549 struct device_domain_info *info = dev_iommu_priv_get(dev);
4550 struct dev_pasid_info *curr, *dev_pasid = NULL;
4551 struct intel_iommu *iommu = info->iommu;
4552 struct dmar_domain *dmar_domain;
4553 struct iommu_domain *domain;
4554 unsigned long flags;
4556 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4557 if (WARN_ON_ONCE(!domain))
4561 * The SVA implementation needs to handle its own stuff, such as mm
4562 * notifications. Before consolidating that code into the iommu core, let
4563 * the intel sva code handle it.
4565 if (domain->type == IOMMU_DOMAIN_SVA) {
4566 intel_svm_remove_dev_pasid(dev, pasid);
4570 dmar_domain = to_dmar_domain(domain);
4571 spin_lock_irqsave(&dmar_domain->lock, flags);
4572 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4573 if (curr->dev == dev && curr->pasid == pasid) {
4574 list_del(&curr->link_domain);
4579 WARN_ON_ONCE(!dev_pasid);
4580 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4582 domain_detach_iommu(dmar_domain, iommu);
4583 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4586 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4587 intel_drain_pasid_prq(dev, pasid);
4590 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4591 struct device *dev, ioasid_t pasid)
4593 struct device_domain_info *info = dev_iommu_priv_get(dev);
4594 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4595 struct intel_iommu *iommu = info->iommu;
4596 struct dev_pasid_info *dev_pasid;
4597 unsigned long flags;
4600 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4603 if (domain->dirty_ops)
4606 if (context_copied(iommu, info->bus, info->devfn))
4609 ret = prepare_domain_attach_device(domain, dev);
4613 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4617 ret = domain_attach_iommu(dmar_domain, iommu);
4621 if (domain_type_is_si(dmar_domain))
4622 ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4623 else if (dmar_domain->use_first_level)
4624 ret = domain_setup_first_level(iommu, dmar_domain,
4627 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4630 goto out_detach_iommu;
4632 dev_pasid->dev = dev;
4633 dev_pasid->pasid = pasid;
4634 spin_lock_irqsave(&dmar_domain->lock, flags);
4635 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4636 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4638 if (domain->type & __IOMMU_DOMAIN_PAGING)
4639 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4643 domain_detach_iommu(dmar_domain, iommu);
4649 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4651 struct device_domain_info *info = dev_iommu_priv_get(dev);
4652 struct intel_iommu *iommu = info->iommu;
4653 struct iommu_hw_info_vtd *vtd;
4655 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4657 return ERR_PTR(-ENOMEM);
4659 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4660 vtd->cap_reg = iommu->cap;
4661 vtd->ecap_reg = iommu->ecap;
4662 *length = sizeof(*vtd);
4663 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4667 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4670 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4671 struct device_domain_info *info;
4674 spin_lock(&dmar_domain->lock);
4675 if (dmar_domain->dirty_tracking == enable)
4678 list_for_each_entry(info, &dmar_domain->devices, link) {
4679 ret = intel_pasid_setup_dirty_tracking(info->iommu,
4680 info->domain, info->dev,
4681 IOMMU_NO_PASID, enable);
4686 dmar_domain->dirty_tracking = enable;
4688 spin_unlock(&dmar_domain->lock);
4693 list_for_each_entry(info, &dmar_domain->devices, link)
4694 intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
4695 info->dev, IOMMU_NO_PASID,
4696 dmar_domain->dirty_tracking);
4697 spin_unlock(&dmar_domain->lock);
4701 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4702 unsigned long iova, size_t size,
4703 unsigned long flags,
4704 struct iommu_dirty_bitmap *dirty)
4706 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4707 unsigned long end = iova + size - 1;
4708 unsigned long pgsize;
4711 * The IOMMUFD core calls into a dirty-tracking-disabled domain without an
4712 * IOVA bitmap set in order to clear any dirty bits left in the PTEs when
4713 * dirty tracking was stopped. This ensures that we never inherit dirtied
4714 * bits from a previous cycle.
4716 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4720 struct dma_pte *pte;
4723 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4725 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4726 if (!pte || !dma_pte_present(pte)) {
4731 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4732 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4734 } while (iova < end);
4739 static const struct iommu_dirty_ops intel_dirty_ops = {
4740 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4741 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4744 const struct iommu_ops intel_iommu_ops = {
4745 .blocked_domain = &blocking_domain,
4746 .capable = intel_iommu_capable,
4747 .hw_info = intel_iommu_hw_info,
4748 .domain_alloc = intel_iommu_domain_alloc,
4749 .domain_alloc_user = intel_iommu_domain_alloc_user,
4750 .probe_device = intel_iommu_probe_device,
4751 .probe_finalize = intel_iommu_probe_finalize,
4752 .release_device = intel_iommu_release_device,
4753 .get_resv_regions = intel_iommu_get_resv_regions,
4754 .device_group = intel_iommu_device_group,
4755 .dev_enable_feat = intel_iommu_dev_enable_feat,
4756 .dev_disable_feat = intel_iommu_dev_disable_feat,
4757 .is_attach_deferred = intel_iommu_is_attach_deferred,
4758 .def_domain_type = device_def_domain_type,
4759 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4760 .pgsize_bitmap = SZ_4K,
4761 #ifdef CONFIG_INTEL_IOMMU_SVM
4762 .page_response = intel_svm_page_response,
4764 .default_domain_ops = &(const struct iommu_domain_ops) {
4765 .attach_dev = intel_iommu_attach_device,
4766 .set_dev_pasid = intel_iommu_set_dev_pasid,
4767 .map_pages = intel_iommu_map_pages,
4768 .unmap_pages = intel_iommu_unmap_pages,
4769 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4770 .flush_iotlb_all = intel_flush_iotlb_all,
4771 .iotlb_sync = intel_iommu_tlb_sync,
4772 .iova_to_phys = intel_iommu_iova_to_phys,
4773 .free = intel_iommu_domain_free,
4774 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4778 static void quirk_iommu_igfx(struct pci_dev *dev)
4780 if (risky_device(dev))
4783 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4787 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4796 /* Broadwell igfx malfunctions with dmar */
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4822 static void quirk_iommu_rwbf(struct pci_dev *dev)
4824 if (risky_device(dev))
4828 * Mobile 4 Series Chipset neglects to set RWBF capability,
4829 * but needs it. Same seems to hold for the desktop versions.
4831 pci_info(dev, "Forcing write-buffer flush capability\n");
4835 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
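/*
 * Illustrative sketch (hypothetical helper, not the driver's own flush
 * routine): how the rwbf_quirk flag set by the fixups above is typically
 * consumed, by forcing a write-buffer flush even when the capability
 * register does not advertise RWBF.
 */
static void __maybe_unused example_flush_write_buffer(struct intel_iommu *iommu)
{
	unsigned long flag;
	u32 val;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Wait until hardware clears the Write Buffer Flush Status bit. */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}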
4844 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4845 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4846 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4847 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4848 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4849 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4850 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4851 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4853 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4857 if (risky_device(dev))
4860 if (pci_read_config_word(dev, GGC, &ggc))
4863 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4864 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4866 } else if (dmar_map_gfx) {
4867 /* we have to ensure the gfx device is idle before we flush */
4868 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4869 iommu_set_dma_strict();
4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4877 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4881 if (!IS_GFX_DEVICE(dev))
4884 ver = (dev->device >> 8) & 0xff;
4885 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4886 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4887 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4890 if (risky_device(dev))
4893 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4894 iommu_skip_te_disable = 1;
4896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4898 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4899 ISOCH DMAR unit for the Azalia sound device, but not give it any
4900 TLB entries, which causes it to deadlock. Check for that. We do
4901 this in a function called from init_dmars(), instead of in a PCI
4902 quirk, because we don't want to print the obnoxious "BIOS broken"
4903 message if VT-d is actually disabled.
4905 static void __init check_tylersburg_isoch(void)
4907 struct pci_dev *pdev;
4908 uint32_t vtisochctrl;
4910 /* If there's no Azalia in the system anyway, forget it. */
4911 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4915 if (risky_device(pdev)) {
4922 /* System Management Registers. Might be hidden, in which case
4923 we can't do the sanity check. But that's OK, because the
4924 known-broken BIOSes _don't_ actually hide it, so far. */
4925 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4929 if (risky_device(pdev)) {
4934 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4941 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4942 if (vtisochctrl & 1)
4945 /* Drop all bits other than the number of TLB entries */
4946 vtisochctrl &= 0x1c;
4948 /* If we have the recommended number of TLB entries (16), fine. */
4949 if (vtisochctrl == 0x10)
4952 /* Zero TLB entries? You get to ride the short bus to school. */
4954 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4955 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4956 dmi_get_system_info(DMI_BIOS_VENDOR),
4957 dmi_get_system_info(DMI_BIOS_VERSION),
4958 dmi_get_system_info(DMI_PRODUCT_VERSION));
4959 iommu_identity_mapping |= IDENTMAP_AZALIA;
4963 	pr_warn("The recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
4968  * Here we deal with a device TLB defect where a device may inadvertently issue
4969  * an ATS invalidation completion before posted writes that were initiated with
4970  * translated addresses matching the invalidated address range have completed,
4971  * violating the invalidation completion ordering.
4972  * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4973  * vulnerable to this defect. In other words, any dTLB invalidation that is not
4974  * initiated under the control of the trusted/privileged host device driver must
4975  * use this quirk (see the illustrative flush sketch below).
4976 * Device TLBs are invalidated under the following six conditions:
4977 * 1. Device driver does DMA API unmap IOVA
4978  * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4979  * 3. A PASID is torn down after the PASID cache is flushed, e.g. process
4980  *    exit_mmap() due to a crash
4981  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where the
4982  *    VM has to free pages that were unmapped
4983 * 5. Userspace driver unmaps a DMA buffer
4984 * 6. Cache invalidation in vSVA usage (upcoming)
4986 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4987  * before unmap/unbind. For #3, the iommu driver relies on the mmu_notifier to
4988  * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
4989 * The dTLB invalidation after PASID cache flush does not need this quirk.
4991 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4993 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4994 unsigned long address, unsigned long mask,
4995 u32 pasid, u16 qdep)
4999 if (likely(!info->dtlb_extra_inval))
5002 sid = PCI_DEVID(info->bus, info->devfn);
5003 if (pasid == IOMMU_NO_PASID) {
5004 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5005 qdep, address, mask);
5007 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5008 pasid, qdep, address, mask);
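/*
 * Illustrative sketch (hypothetical helper, not part of this file): a dTLB
 * invalidation path that cannot guarantee DMA has already been stopped first
 * issues the regular device-IOTLB flush and then the extra flush described
 * above. The field names follow struct device_domain_info.
 */
static void __maybe_unused example_flush_dev_iotlb(struct device_domain_info *info,
						   u64 addr, unsigned int mask)
{
	u16 sid, qdep;

	if (!info->ats_enabled)
		return;

	sid = PCI_DEVID(info->bus, info->devfn);
	qdep = info->ats_qdep;
	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, qdep, addr, mask);
	/* Extra flush to cover the premature invalidation completion defect. */
	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
}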
5012 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
5015 * Function to submit a command to the enhanced command interface. The
5016 * valid enhanced command descriptions are defined in Table 47 of the
5017 * VT-d spec. The VT-d hardware implementation may support some but not
5018 * all commands, which can be determined by checking the Enhanced
5019  * Command Capability Register. Return values:
5022 * - 0: Command successful without any error;
5023 * - Negative: software error value;
5024 * - Nonzero positive: failure status code defined in Table 48.
5026 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5028 unsigned long flags;
5032 if (!cap_ecmds(iommu->cap))
5035 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5037 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5038 if (res & DMA_ECMD_ECRSP_IP) {
5044 * Unconditionally write the operand B, because
5045 * - There is no side effect if an ecmd doesn't require an
5046 * operand B, but we set the register to some value.
5047 * - It's not invoked in any critical path. The extra MMIO
5048 * write doesn't bring any performance concerns.
5050 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5051 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5053 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5054 !(res & DMA_ECMD_ECRSP_IP), res);
5056 if (res & DMA_ECMD_ECRSP_IP) {
5061 ret = ecmd_get_status_code(res);
5063 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
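/*
 * Illustrative sketch (hypothetical caller, not part of this file): how a
 * user of ecmd_submit_sync() is expected to interpret its return value as
 * documented above. The command code and operand A are placeholders chosen
 * by the caller from Table 47 of the VT-d spec.
 */
static int __maybe_unused example_issue_ecmd(struct intel_iommu *iommu, u8 ecmd, u64 oa)
{
	int ret;

	ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
	if (ret < 0)		/* software error, e.g. -ENODEV or -EBUSY */
		return ret;
	if (ret > 0) {		/* hardware failure status code from Table 48 */
		pr_err("ecmd 0x%x failed, status 0x%x\n", ecmd, ret);
		return -EIO;
	}

	return 0;		/* command completed successfully */
}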