1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
27 #include "../dma-iommu.h"
28 #include "../irq_remapping.h"
29 #include "../iommu-sva.h"
31 #include "cap_audit.h"
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
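/*
 * Illustrative example (assumes VTD_PAGE_SHIFT == 12): for gaw = 48,
 * __DOMAIN_MAX_PFN(48) = 2^36 - 1 and __DOMAIN_MAX_ADDR(48) = 2^48 - 1.
 * DOMAIN_MAX_PFN(48) is the same 2^36 - 1 on a 64-bit kernel but clamps
 * to ULONG_MAX on 32-bit, while DOMAIN_MAX_ADDR(48) is the max PFN
 * shifted back up: (2^36 - 1) << 12 = 0xfffffffff000.
 */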
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN (1)
63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
65 /* page table handling */
66 #define LEVEL_STRIDE (9)
67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
69 static inline int agaw_to_level(int agaw)
74 static inline int agaw_to_width(int agaw)
76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
79 static inline int width_to_agaw(int width)
81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
84 static inline unsigned int level_to_offset_bits(int level)
86 return (level - 1) * LEVEL_STRIDE;
89 static inline int pfn_level_offset(u64 pfn, int level)
91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
94 static inline u64 level_mask(int level)
96 return -1ULL << level_to_offset_bits(level);
99 static inline u64 level_size(int level)
101 return 1ULL << level_to_offset_bits(level);
104 static inline u64 align_to_level(u64 pfn, int level)
106 return (pfn + level_size(level) - 1) & level_mask(level);
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
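/*
 * Illustrative example of the 9-bit-per-level arithmetic above (assumes
 * VTD_PAGE_SHIFT == 12): each level indexes 512 entries, so a level-1
 * entry maps 4KiB, a level-2 entry 2MiB and a level-3 entry 1GiB.
 * For pfn 0x12345:
 *   pfn_level_offset(0x12345, 1) = 0x12345 & 0x1ff         = 0x145
 *   pfn_level_offset(0x12345, 2) = (0x12345 >> 9) & 0x1ff  = 0x091
 *   pfn_level_offset(0x12345, 3) = (0x12345 >> 18) & 0x1ff = 0x000
 *   align_to_level(0x12345, 2)   = 0x12400
 *   lvl_to_nr_pages(2)           = 512
 */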
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115 are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
122 return mm_to_dma_pfn(page_to_pfn(pg));
124 static inline unsigned long virt_to_dma_pfn(void *p)
126 return page_to_dma_pfn(virt_to_page(p));
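/*
 * Illustrative note: on x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12,
 * so mm_to_dma_pfn() shifts by zero and MM PFNs map 1:1 onto VT-d PFNs;
 * the conversion only matters when MM pages are larger than the 4KiB
 * VT-d page.
 */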
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
133 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
134 * (used when the kernel is launched with TXT)
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
143 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
151 return re->lo & VTD_PAGE_MASK;
155 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
163 return re->hi & VTD_PAGE_MASK;
166 static inline void context_set_present(struct context_entry *context)
171 static inline void context_set_fault_enable(struct context_entry *context)
173 context->lo &= (((u64)-1) << 2) | 1;
176 static inline void context_set_translation_type(struct context_entry *context,
179 context->lo &= (((u64)-1) << 4) | 3;
180 context->lo |= (value & 3) << 2;
183 static inline void context_set_address_root(struct context_entry *context,
186 context->lo &= ~VTD_PAGE_MASK;
187 context->lo |= value & VTD_PAGE_MASK;
190 static inline void context_set_address_width(struct context_entry *context,
193 context->hi |= value & 7;
196 static inline void context_set_domain_id(struct context_entry *context,
199 context->hi |= (value & ((1 << 16) - 1)) << 8;
202 static inline void context_set_pasid(struct context_entry *context)
204 context->lo |= CONTEXT_PASIDE;
207 static inline int context_domain_id(struct context_entry *c)
209 return((c->hi >> 8) & 0xffff);
212 static inline void context_clear_entry(struct context_entry *context)
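/*
 * Summary of the legacy context-entry bits manipulated by the helpers
 * above, derived from the shifts and masks they use (see the VT-d spec
 * for the authoritative layout):
 *   lo[0]      present
 *   lo[1]      fault processing disable (cleared by context_set_fault_enable)
 *   lo[3:2]    translation type
 *   lo[63:12]  address root (page-table pointer)
 *   hi[2:0]    address width (AGAW)
 *   hi[23:8]   domain id
 */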
218 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
220 if (!iommu->copied_tables)
223 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
227 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
229 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
233 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
235 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
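/*
 * Illustrative note: the copied_tables bitmap is indexed by the 16-bit
 * requester ID, (bus << 8) | devfn, so it can track all 65536 possible
 * source-ids per IOMMU; e.g. bus 0x3a, devfn 0x10 uses bit 0x3a10.
 */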
239 * This domain is a static identity mapping domain.
240 *	1. This domain creates a static 1:1 mapping to all usable memory.
241 *	2. It maps to each iommu if successful.
242 *	3. Each iommu maps to this domain if successful.
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
247 struct dmar_rmrr_unit {
248 struct list_head list; /* list of rmrr units */
249 struct acpi_dmar_header *hdr; /* ACPI header */
250 u64 base_address; /* reserved base address*/
251 u64 end_address; /* reserved end address */
252 struct dmar_dev_scope *devices; /* target devices */
253 int devices_cnt; /* target device count */
256 struct dmar_atsr_unit {
257 struct list_head list; /* list of ATSR units */
258 struct acpi_dmar_header *hdr; /* ACPI header */
259 struct dmar_dev_scope *devices; /* target devices */
260 int devices_cnt; /* target device count */
261 u8 include_all:1; /* include all ports */
264 struct dmar_satc_unit {
265 struct list_head list; /* list of SATC units */
266 struct acpi_dmar_header *hdr; /* ACPI header */
267 struct dmar_dev_scope *devices; /* target devices */
268 struct intel_iommu *iommu; /* the corresponding iommu */
269 int devices_cnt; /* target device count */
270 u8 atc_required:1; /* ATS is required */
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
277 #define for_each_rmrr_units(rmrr) \
278 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
280 static void device_block_translation(struct device *dev);
281 static void intel_iommu_domain_free(struct iommu_domain *domain);
283 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
286 int intel_iommu_enabled = 0;
287 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
289 static int dmar_map_gfx = 1;
290 static int intel_iommu_superpage = 1;
291 static int iommu_identity_mapping;
292 static int iommu_skip_te_disable;
294 #define IDENTMAP_GFX 2
295 #define IDENTMAP_AZALIA 4
297 const struct iommu_ops intel_iommu_ops;
299 static bool translation_pre_enabled(struct intel_iommu *iommu)
301 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
304 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
306 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
309 static void init_translation_status(struct intel_iommu *iommu)
313 gsts = readl(iommu->reg + DMAR_GSTS_REG);
314 if (gsts & DMA_GSTS_TES)
315 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
318 static int __init intel_iommu_setup(char *str)
324 if (!strncmp(str, "on", 2)) {
326 pr_info("IOMMU enabled\n");
327 } else if (!strncmp(str, "off", 3)) {
329 no_platform_optin = 1;
330 pr_info("IOMMU disabled\n");
331 } else if (!strncmp(str, "igfx_off", 8)) {
333 pr_info("Disable GFX device mapping\n");
334 } else if (!strncmp(str, "forcedac", 8)) {
335 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336 iommu_dma_forcedac = true;
337 } else if (!strncmp(str, "strict", 6)) {
338 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339 iommu_set_dma_strict();
340 } else if (!strncmp(str, "sp_off", 6)) {
341 pr_info("Disable supported super page\n");
342 intel_iommu_superpage = 0;
343 } else if (!strncmp(str, "sm_on", 5)) {
344 pr_info("Enable scalable mode if hardware supports\n");
346 } else if (!strncmp(str, "sm_off", 6)) {
347 pr_info("Scalable mode is disallowed\n");
349 } else if (!strncmp(str, "tboot_noforce", 13)) {
350 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
351 intel_iommu_tboot_noforce = 1;
353 pr_notice("Unknown option - '%s'\n", str);
356 str += strcspn(str, ",");
363 __setup("intel_iommu=", intel_iommu_setup);
365 void *alloc_pgtable_page(int node, gfp_t gfp)
370 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
372 vaddr = page_address(page);
376 void free_pgtable_page(void *vaddr)
378 free_page((unsigned long)vaddr);
381 static inline int domain_type_is_si(struct dmar_domain *domain)
383 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
386 static inline int domain_pfn_supported(struct dmar_domain *domain,
389 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
391 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397 * the returned SAGAW.
399 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
401 unsigned long fl_sagaw, sl_sagaw;
403 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404 sl_sagaw = cap_sagaw(iommu->cap);
406 /* Second level only. */
407 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
410 /* First level only. */
411 if (!ecap_slts(iommu->ecap))
414 return fl_sagaw & sl_sagaw;
417 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
422 sagaw = __iommu_calculate_sagaw(iommu);
423 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
424 if (test_bit(agaw, &sagaw))
432 * Calculate max SAGAW for each iommu.
434 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
436 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
440 * Calculate the agaw for each iommu.
441 * "SAGAW" may be different across iommus; use a default agaw, and
442 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
444 int iommu_calculate_agaw(struct intel_iommu *iommu)
446 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
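/*
 * Illustrative example: agaw_to_width() gives 30 + 9 * agaw, so agaw
 * 1/2/3 correspond to 39/48/57-bit widths and width_to_agaw(48) =
 * DIV_ROUND_UP(18, 9) = 2. Per VT-d spec 11.4.2, SAGAW bits 1/2/3
 * advertise 3/4/5-level (39/48/57-bit) second-level tables. An IOMMU
 * reporting cap_sagaw = 0b0110 (39- and 48-bit) with 4-level-only
 * first-level support yields a combined SAGAW of BIT(2), so
 * iommu_calculate_agaw() walks down from width_to_agaw(57) = 3 and
 * settles on agaw 2, i.e. a 48-bit domain address width.
 */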
449 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
451 return sm_supported(iommu) ?
452 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
455 static void domain_update_iommu_coherency(struct dmar_domain *domain)
457 struct iommu_domain_info *info;
458 struct dmar_drhd_unit *drhd;
459 struct intel_iommu *iommu;
463 domain->iommu_coherency = true;
464 xa_for_each(&domain->iommu_array, i, info) {
466 if (!iommu_paging_structure_coherency(info->iommu)) {
467 domain->iommu_coherency = false;
474 /* No hardware attached; use lowest common denominator */
476 for_each_active_iommu(iommu, drhd) {
477 if (!iommu_paging_structure_coherency(iommu)) {
478 domain->iommu_coherency = false;
485 static int domain_update_iommu_superpage(struct dmar_domain *domain,
486 struct intel_iommu *skip)
488 struct dmar_drhd_unit *drhd;
489 struct intel_iommu *iommu;
492 if (!intel_iommu_superpage)
495 /* set iommu_superpage to the smallest common denominator */
497 for_each_active_iommu(iommu, drhd) {
499 if (domain && domain->use_first_level) {
500 if (!cap_fl1gp_support(iommu->cap))
503 mask &= cap_super_page_val(iommu->cap);
515 static int domain_update_device_node(struct dmar_domain *domain)
517 struct device_domain_info *info;
518 int nid = NUMA_NO_NODE;
521 spin_lock_irqsave(&domain->lock, flags);
522 list_for_each_entry(info, &domain->devices, link) {
524 * There could be multiple device NUMA nodes, as devices within
525 * the same domain may sit behind different IOMMUs. There is no
526 * perfect answer in such a situation, so we select a first come,
527 * first served policy.
529 nid = dev_to_node(info->dev);
530 if (nid != NUMA_NO_NODE)
533 spin_unlock_irqrestore(&domain->lock, flags);
538 static void domain_update_iotlb(struct dmar_domain *domain);
540 /* Return the super pagesize bitmap if supported. */
541 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
543 unsigned long bitmap = 0;
546 * A 1-level super page supports a page size of 2MiB; a 2-level super
547 * page supports page sizes of both 2MiB and 1GiB.
549 if (domain->iommu_superpage == 1)
551 else if (domain->iommu_superpage == 2)
552 bitmap |= SZ_2M | SZ_1G;
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
560 domain_update_iommu_coherency(domain);
561 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
564 * If RHSA is missing, we should default to the device numa domain
567 if (domain->nid == NUMA_NO_NODE)
568 domain->nid = domain_update_device_node(domain);
571 * First-level translation restricts the input address to a
572 * canonical address (i.e., address bits 63:N have the same
573 * value as address bit [N-1], where N is 48 bits with 4-level
574 * paging and 57 bits with 5-level paging). Hence, skip bit [N-1].
577 if (domain->use_first_level)
578 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
580 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
582 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583 domain_update_iotlb(domain);
586 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
589 struct root_entry *root = &iommu->root_entry[bus];
590 struct context_entry *context;
594 * Unless the caller requested to allocate a new entry,
595 * returning a copied context entry makes no sense.
597 if (!alloc && context_copied(iommu, bus, devfn))
601 if (sm_supported(iommu)) {
609 context = phys_to_virt(*entry & VTD_PAGE_MASK);
611 unsigned long phy_addr;
615 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
619 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620 phy_addr = virt_to_phys((void *)context);
621 *entry = phy_addr | 1;
622 __iommu_flush_cache(iommu, entry, sizeof(*entry));
624 return &context[devfn];
628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629 * sub-hierarchy of a candidate PCI-PCI bridge
630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631 * @bridge: the candidate PCI-PCI bridge
633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
636 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
638 struct pci_dev *pdev, *pbridge;
640 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
643 pdev = to_pci_dev(dev);
644 pbridge = to_pci_dev(bridge);
646 if (pbridge->subordinate &&
647 pbridge->subordinate->number <= pdev->bus->number &&
648 pbridge->subordinate->busn_res.end >= pdev->bus->number)
654 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
656 struct dmar_drhd_unit *drhd;
660 /* We know that this device on this chipset has its own IOMMU.
661 * If we find it under a different IOMMU, then the BIOS is lying
662 * to us. Hope that the IOMMU for this device is actually
663 * disabled, and it needs no translation...
665 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
668 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
673 /* we know that this iommu should be at offset 0xa000 from vtbar */
674 drhd = dmar_find_matched_drhd_unit(pdev);
675 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
684 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
686 if (!iommu || iommu->drhd->ignored)
689 if (dev_is_pci(dev)) {
690 struct pci_dev *pdev = to_pci_dev(dev);
692 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694 quirk_ioat_snb_local_iommu(pdev))
701 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
703 struct dmar_drhd_unit *drhd = NULL;
704 struct pci_dev *pdev = NULL;
705 struct intel_iommu *iommu;
713 if (dev_is_pci(dev)) {
714 struct pci_dev *pf_pdev;
716 pdev = pci_real_dma_dev(to_pci_dev(dev));
718 /* VFs aren't listed in scope tables; we need to look up
719 * the PF instead to find the IOMMU. */
720 pf_pdev = pci_physfn(pdev);
722 segment = pci_domain_nr(pdev->bus);
723 } else if (has_acpi_companion(dev))
724 dev = &ACPI_COMPANION(dev)->dev;
727 for_each_iommu(iommu, drhd) {
728 if (pdev && segment != drhd->segment)
731 for_each_active_dev_scope(drhd->devices,
732 drhd->devices_cnt, i, tmp) {
734 /* For a VF use its original BDF# not that of the PF
735 * which we used for the IOMMU lookup. Strictly speaking
736 * we could do this for all PCI devices; we only need to
737 * get the BDF# from the scope table for ACPI matches. */
738 if (pdev && pdev->is_virtfn)
742 *bus = drhd->devices[i].bus;
743 *devfn = drhd->devices[i].devfn;
748 if (is_downstream_to_pci_bridge(dev, tmp))
752 if (pdev && drhd->include_all) {
755 *bus = pdev->bus->number;
756 *devfn = pdev->devfn;
763 if (iommu_is_dummy(iommu, dev))
771 static void domain_flush_cache(struct dmar_domain *domain,
772 void *addr, int size)
774 if (!domain->iommu_coherency)
775 clflush_cache_range(addr, size);
778 static void free_context_table(struct intel_iommu *iommu)
780 struct context_entry *context;
783 if (!iommu->root_entry)
786 for (i = 0; i < ROOT_ENTRY_NR; i++) {
787 context = iommu_context_addr(iommu, i, 0, 0);
789 free_pgtable_page(context);
791 if (!sm_supported(iommu))
794 context = iommu_context_addr(iommu, i, 0x80, 0);
796 free_pgtable_page(context);
799 free_pgtable_page(iommu->root_entry);
800 iommu->root_entry = NULL;
803 #ifdef CONFIG_DMAR_DEBUG
804 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805 u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 offset = pfn_level_offset(pfn, level);
812 pte = &parent[offset];
813 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814 pr_info("PTE not present at level %d\n", level);
818 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
823 parent = phys_to_virt(dma_pte_addr(pte));
828 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829 unsigned long long addr, u32 pasid)
831 struct pasid_dir_entry *dir, *pde;
832 struct pasid_entry *entries, *pte;
833 struct context_entry *ctx_entry;
834 struct root_entry *rt_entry;
835 int i, dir_index, index, level;
836 u8 devfn = source_id & 0xff;
837 u8 bus = source_id >> 8;
838 struct dma_pte *pgtable;
840 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
842 /* root entry dump */
843 rt_entry = &iommu->root_entry[bus];
845 pr_info("root table entry is not present\n");
849 if (sm_supported(iommu))
850 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851 rt_entry->hi, rt_entry->lo);
853 pr_info("root entry: 0x%016llx", rt_entry->lo);
855 /* context entry dump */
856 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
858 pr_info("context table entry is not present\n");
862 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863 ctx_entry->hi, ctx_entry->lo);
865 /* legacy mode does not require PASID entries */
866 if (!sm_supported(iommu)) {
867 level = agaw_to_level(ctx_entry->hi & 7);
868 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
872 /* get the pointer to pasid directory entry */
873 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
875 pr_info("pasid directory entry is not present\n");
878 /* For request-without-pasid, get the pasid from context entry */
879 if (intel_iommu_sm && pasid == INVALID_IOASID)
880 pasid = PASID_RID2PASID;
882 dir_index = pasid >> PASID_PDE_SHIFT;
883 pde = &dir[dir_index];
884 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
886 /* get the pointer to the pasid table entry */
887 entries = get_pasid_table_from_pde(pde);
889 pr_info("pasid table entry is not present\n");
892 index = pasid & PASID_PTE_MASK;
893 pte = &entries[index];
894 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
897 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
901 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
906 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level,
914 struct dma_pte *parent, *pte;
915 int level = agaw_to_level(domain->agaw);
918 BUG_ON(!domain->pgd);
920 if (!domain_pfn_supported(domain, pfn))
921 /* Address beyond IOMMU's addressing capabilities. */
924 parent = domain->pgd;
929 offset = pfn_level_offset(pfn, level);
930 pte = &parent[offset];
931 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
933 if (level == *target_level)
936 if (!dma_pte_present(pte)) {
939 tmp_page = alloc_pgtable_page(domain->nid, gfp);
944 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
945 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
946 if (domain->use_first_level)
947 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
949 if (cmpxchg64(&pte->val, 0ULL, pteval))
950 /* Someone else set it while we were thinking; use theirs. */
951 free_pgtable_page(tmp_page);
953 domain_flush_cache(domain, pte, sizeof(*pte));
958 parent = phys_to_virt(dma_pte_addr(pte));
963 *target_level = level;
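/*
 * Note on the walk above: called with *target_level == 0 it descends
 * until it reaches an existing leaf (a present superpage) or a hole and
 * reports the level it stopped at back through *target_level; called
 * with a specific level, e.g. 2 for a 2MiB mapping, it allocates any
 * missing intermediate tables via alloc_pgtable_page() and returns the
 * PTE slot at that level.
 */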
968 /* return the address's pte at a specific level */
969 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
971 int level, int *large_page)
973 struct dma_pte *parent, *pte;
974 int total = agaw_to_level(domain->agaw);
977 parent = domain->pgd;
978 while (level <= total) {
979 offset = pfn_level_offset(pfn, total);
980 pte = &parent[offset];
984 if (!dma_pte_present(pte)) {
989 if (dma_pte_superpage(pte)) {
994 parent = phys_to_virt(dma_pte_addr(pte));
1000 /* clear last level pte; a tlb flush should follow */
1001 static void dma_pte_clear_range(struct dmar_domain *domain,
1002 unsigned long start_pfn,
1003 unsigned long last_pfn)
1005 unsigned int large_page;
1006 struct dma_pte *first_pte, *pte;
1008 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1009 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1010 BUG_ON(start_pfn > last_pfn);
1012 /* we don't need lock here; nobody else touches the iova range */
1015 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1017 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1022 start_pfn += lvl_to_nr_pages(large_page);
1024 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1026 domain_flush_cache(domain, first_pte,
1027 (void *)pte - (void *)first_pte);
1029 } while (start_pfn && start_pfn <= last_pfn);
1032 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1033 int retain_level, struct dma_pte *pte,
1034 unsigned long pfn, unsigned long start_pfn,
1035 unsigned long last_pfn)
1037 pfn = max(start_pfn, pfn);
1038 pte = &pte[pfn_level_offset(pfn, level)];
1041 unsigned long level_pfn;
1042 struct dma_pte *level_pte;
1044 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1047 level_pfn = pfn & level_mask(level);
1048 level_pte = phys_to_virt(dma_pte_addr(pte));
1051 dma_pte_free_level(domain, level - 1, retain_level,
1052 level_pte, level_pfn, start_pfn,
1057 * Free the page table if we're below the level we want to
1058 * retain and the range covers the entire table.
1060 if (level < retain_level && !(start_pfn > level_pfn ||
1061 last_pfn < level_pfn + level_size(level) - 1)) {
1063 domain_flush_cache(domain, pte, sizeof(*pte));
1064 free_pgtable_page(level_pte);
1067 pfn += level_size(level);
1068 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072 * clear last level (leaf) ptes and free page table pages below the
1073 * level we wish to keep intact.
1075 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1076 unsigned long start_pfn,
1077 unsigned long last_pfn,
1080 dma_pte_clear_range(domain, start_pfn, last_pfn);
1082 /* We don't need lock here; nobody else touches the iova range */
1083 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1084 domain->pgd, 0, start_pfn, last_pfn);
1087 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1088 free_pgtable_page(domain->pgd);
1093 /* When a page at a given level is being unlinked from its parent, we don't
1094 need to *modify* it at all. All we need to do is make a list of all the
1095 pages which can be freed just as soon as we've flushed the IOTLB and we
1096 know the hardware page-walk will no longer touch them.
1097 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed. */
1099 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1100 int level, struct dma_pte *pte,
1101 struct list_head *freelist)
1105 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1106 list_add_tail(&pg->lru, freelist);
1111 pte = page_address(pg);
1113 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1114 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1116 } while (!first_pte_in_page(pte));
1119 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1120 struct dma_pte *pte, unsigned long pfn,
1121 unsigned long start_pfn, unsigned long last_pfn,
1122 struct list_head *freelist)
1124 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1126 pfn = max(start_pfn, pfn);
1127 pte = &pte[pfn_level_offset(pfn, level)];
1130 unsigned long level_pfn = pfn & level_mask(level);
1132 if (!dma_pte_present(pte))
1135 /* If range covers entire pagetable, free it */
1136 if (start_pfn <= level_pfn &&
1137 last_pfn >= level_pfn + level_size(level) - 1) {
1138 /* These subordinate page tables are going away entirely. Don't
1139 bother to clear them; we're just going to *free* them. */
1140 if (level > 1 && !dma_pte_superpage(pte))
1141 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1147 } else if (level > 1) {
1148 /* Recurse down into a level that isn't *entirely* obsolete */
1149 dma_pte_clear_level(domain, level - 1,
1150 phys_to_virt(dma_pte_addr(pte)),
1151 level_pfn, start_pfn, last_pfn,
1155 pfn = level_pfn + level_size(level);
1156 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1159 domain_flush_cache(domain, first_pte,
1160 (void *)++last_pte - (void *)first_pte);
1163 /* We can't just free the pages because the IOMMU may still be walking
1164 the page tables, and may have cached the intermediate levels. The
1165 pages can only be freed after the IOTLB flush has been done. */
1166 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1167 unsigned long last_pfn, struct list_head *freelist)
1169 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1170 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1171 BUG_ON(start_pfn > last_pfn);
1173 /* we don't need lock here; nobody else touches the iova range */
1174 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1175 domain->pgd, 0, start_pfn, last_pfn, freelist);
1178 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1179 struct page *pgd_page = virt_to_page(domain->pgd);
1180 list_add_tail(&pgd_page->lru, freelist);
1185 /* iommu handling */
1186 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1188 struct root_entry *root;
1190 root = (struct root_entry *)alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1192 pr_err("Allocating root entry for %s failed\n",
1197 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1198 iommu->root_entry = root;
1203 static void iommu_set_root_entry(struct intel_iommu *iommu)
1209 addr = virt_to_phys(iommu->root_entry);
1210 if (sm_supported(iommu))
1211 addr |= DMA_RTADDR_SMT;
1213 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1214 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1216 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1218 /* Make sure the hardware completes it */
1219 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1220 readl, (sts & DMA_GSTS_RTPS), sts);
1222 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1225 * Hardware invalidates all DMA remapping hardware translation
1226 * caches as part of SRTP flow.
1228 if (cap_esrtps(iommu->cap))
1231 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1232 if (sm_supported(iommu))
1233 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1234 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1237 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1242 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1245 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1246 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1248 /* Make sure the hardware completes it */
1249 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1250 readl, (!(val & DMA_GSTS_WBFS)), val);
1252 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1255 /* the return value determines whether we need a write buffer flush */
1256 static void __iommu_flush_context(struct intel_iommu *iommu,
1257 u16 did, u16 source_id, u8 function_mask,
1264 case DMA_CCMD_GLOBAL_INVL:
1265 val = DMA_CCMD_GLOBAL_INVL;
1267 case DMA_CCMD_DOMAIN_INVL:
1268 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1270 case DMA_CCMD_DEVICE_INVL:
1271 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1272 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1277 val |= DMA_CCMD_ICC;
1279 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1280 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1282 /* Make sure the hardware completes it */
1283 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1284 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1286 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1289 /* the return value determines whether we need a write buffer flush */
1290 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1291 u64 addr, unsigned int size_order, u64 type)
1293 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1294 u64 val = 0, val_iva = 0;
1298 case DMA_TLB_GLOBAL_FLUSH:
1299 /* a global flush doesn't need to set IVA_REG */
1300 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1302 case DMA_TLB_DSI_FLUSH:
1303 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1305 case DMA_TLB_PSI_FLUSH:
1306 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1307 /* IH bit is passed in as part of address */
1308 val_iva = size_order | addr;
1313 /* Note: set drain read/write */
1316 * This is probably meant to be super secure. It looks like we can
1317 * ignore it without any impact.
1319 if (cap_read_drain(iommu->cap))
1320 val |= DMA_TLB_READ_DRAIN;
1322 if (cap_write_drain(iommu->cap))
1323 val |= DMA_TLB_WRITE_DRAIN;
1325 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1326 /* Note: Only uses first TLB reg currently */
1328 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1329 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1331 /* Make sure the hardware completes it */
1332 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1333 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1335 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1337 /* check IOTLB invalidation granularity */
1338 if (DMA_TLB_IAIG(val) == 0)
1339 pr_err("Flush IOTLB failed\n");
1340 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1341 pr_debug("TLB flush request %Lx, actual %Lx\n",
1342 (unsigned long long)DMA_TLB_IIRG(type),
1343 (unsigned long long)DMA_TLB_IAIG(val));
1346 static struct device_domain_info *
1347 domain_lookup_dev_info(struct dmar_domain *domain,
1348 struct intel_iommu *iommu, u8 bus, u8 devfn)
1350 struct device_domain_info *info;
1351 unsigned long flags;
1353 spin_lock_irqsave(&domain->lock, flags);
1354 list_for_each_entry(info, &domain->devices, link) {
1355 if (info->iommu == iommu && info->bus == bus &&
1356 info->devfn == devfn) {
1357 spin_unlock_irqrestore(&domain->lock, flags);
1361 spin_unlock_irqrestore(&domain->lock, flags);
1366 static void domain_update_iotlb(struct dmar_domain *domain)
1368 struct device_domain_info *info;
1369 bool has_iotlb_device = false;
1370 unsigned long flags;
1372 spin_lock_irqsave(&domain->lock, flags);
1373 list_for_each_entry(info, &domain->devices, link) {
1374 if (info->ats_enabled) {
1375 has_iotlb_device = true;
1379 domain->has_iotlb_device = has_iotlb_device;
1380 spin_unlock_irqrestore(&domain->lock, flags);
1384 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1385 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1386 * check because it applies only to the built-in QAT devices and it doesn't
1387 * grant additional privileges.
1389 #define BUGGY_QAT_DEVID_MASK 0x4940
1390 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1392 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1395 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1401 static void iommu_enable_pci_caps(struct device_domain_info *info)
1403 struct pci_dev *pdev;
1405 if (!dev_is_pci(info->dev))
1408 pdev = to_pci_dev(info->dev);
1409 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign a
1410  * PFSID to the invalidation desc of a VF such that the IOMMU HW can gauge
1411  * queue depth at the PF level. If DIT is not set, PFSID will be treated as
1412  * reserved, which should be set to 0.
1414 if (!ecap_dit(info->iommu->ecap))
1417 struct pci_dev *pf_pdev;
1419 /* pdev will be returned if device is not a vf */
1420 pf_pdev = pci_physfn(pdev);
1421 info->pfsid = pci_dev_id(pf_pdev);
1424 /* The PCIe spec, in its wisdom, declares that the behaviour of
1425    the device is undefined if you enable PASID support after ATS
1426    support. So always enable PASID support on devices which have
1427    it, even if we can't yet know if we're ever going to use it. */
1429 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1430 info->pasid_enabled = 1;
1432 if (info->pri_supported &&
1433 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1434 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1435 info->pri_enabled = 1;
1437 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1438 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1439 info->ats_enabled = 1;
1440 domain_update_iotlb(info->domain);
1441 info->ats_qdep = pci_ats_queue_depth(pdev);
1445 static void iommu_disable_pci_caps(struct device_domain_info *info)
1447 struct pci_dev *pdev;
1449 if (!dev_is_pci(info->dev))
1452 pdev = to_pci_dev(info->dev);
1454 if (info->ats_enabled) {
1455 pci_disable_ats(pdev);
1456 info->ats_enabled = 0;
1457 domain_update_iotlb(info->domain);
1460 if (info->pri_enabled) {
1461 pci_disable_pri(pdev);
1462 info->pri_enabled = 0;
1465 if (info->pasid_enabled) {
1466 pci_disable_pasid(pdev);
1467 info->pasid_enabled = 0;
1471 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1472 u64 addr, unsigned int mask)
1476 if (!info || !info->ats_enabled)
1479 sid = info->bus << 8 | info->devfn;
1480 qdep = info->ats_qdep;
1481 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1483 quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1486 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1487 u64 addr, unsigned mask)
1489 struct device_domain_info *info;
1490 unsigned long flags;
1492 if (!domain->has_iotlb_device)
1495 spin_lock_irqsave(&domain->lock, flags);
1496 list_for_each_entry(info, &domain->devices, link)
1497 __iommu_flush_dev_iotlb(info, addr, mask);
1498 spin_unlock_irqrestore(&domain->lock, flags);
1501 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1502 struct dmar_domain *domain,
1503 unsigned long pfn, unsigned int pages,
1506 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1507 unsigned int mask = ilog2(aligned_pages);
1508 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1509 u16 did = domain_id_iommu(domain, iommu);
1516 if (domain->use_first_level) {
1517 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1519 unsigned long bitmask = aligned_pages - 1;
1522 * PSI masks the low order bits of the base address. If the
1523 * address isn't aligned to the mask, then compute a mask value
1524 * needed to ensure the target range is flushed.
1526 if (unlikely(bitmask & pfn)) {
1527 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1530 * Since end_pfn <= pfn + bitmask, the only way bits
1531 * higher than bitmask can differ in pfn and end_pfn is
1532 * by carrying. This means after masking out bitmask,
1533 * high bits starting with the first set bit in
1534 * shared_bits are all equal in both pfn and end_pfn.
1536 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1537 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
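/*
 * Illustrative example: pfn = 0x1003, pages = 2 gives aligned_pages = 2,
 * bitmask = 1 and an unaligned base. end_pfn = 0x1004, pfn ^ end_pfn =
 * 0x7, so shared_bits = ~0x7 & ~0x1 and mask = __ffs(shared_bits) = 3:
 * the PSI then covers pfns 0x1000-0x1007, which contains the whole
 * 0x1003-0x1004 target range.
 */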
1541 * Fall back to domain selective flush if there is no PSI support or
1542 * the size is too big.
1544 if (!cap_pgsel_inv(iommu->cap) ||
1545 mask > cap_max_amask_val(iommu->cap))
1546 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1549 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1554 * In caching mode, changes of pages from non-present to present require
1555 * a flush. However, the device IOTLB does not need to be flushed in this case.
1557 if (!cap_caching_mode(iommu->cap) || !map)
1558 iommu_flush_dev_iotlb(domain, addr, mask);
1561 /* Notification for newly created mappings */
1562 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1563 struct dmar_domain *domain,
1564 unsigned long pfn, unsigned int pages)
1567 * It's a non-present to present mapping. Only flush if caching mode and second level.
1570 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1571 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1573 iommu_flush_write_buffer(iommu);
1576 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1578 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1579 struct iommu_domain_info *info;
1582 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1583 struct intel_iommu *iommu = info->iommu;
1584 u16 did = domain_id_iommu(dmar_domain, iommu);
1586 if (dmar_domain->use_first_level)
1587 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1589 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1592 if (!cap_caching_mode(iommu->cap))
1593 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1597 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1600 unsigned long flags;
1602 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1605 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1606 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1607 pmen &= ~DMA_PMEN_EPM;
1608 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1610 /* wait for the protected region status bit to clear */
1611 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1612 readl, !(pmen & DMA_PMEN_PRS), pmen);
1614 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1617 static void iommu_enable_translation(struct intel_iommu *iommu)
1620 unsigned long flags;
1622 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1623 iommu->gcmd |= DMA_GCMD_TE;
1624 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1626 /* Make sure the hardware completes it */
1627 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1628 readl, (sts & DMA_GSTS_TES), sts);
1630 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1633 static void iommu_disable_translation(struct intel_iommu *iommu)
1638 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1639 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1642 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1643 iommu->gcmd &= ~DMA_GCMD_TE;
1644 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1646 /* Make sure the hardware completes it */
1647 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1648 readl, (!(sts & DMA_GSTS_TES)), sts);
1650 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1653 static int iommu_init_domains(struct intel_iommu *iommu)
1657 ndomains = cap_ndoms(iommu->cap);
1658 pr_debug("%s: Number of Domains supported <%d>\n",
1659 iommu->name, ndomains);
1661 spin_lock_init(&iommu->lock);
1663 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1664 if (!iommu->domain_ids)
1668 * If Caching mode is set, then invalid translations are tagged
1669 * with domain-id 0, hence we need to pre-allocate it. We also
1670 * use domain-id 0 as a marker for non-allocated domain-id, so
1671 * make sure it is not used for a real domain.
1673 set_bit(0, iommu->domain_ids);
1676 * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1677 * entry for first-level or pass-through translation modes should
1678 * be programmed with a domain id different from those used for
1679 * second-level or nested translation. We reserve a domain id for this purpose.
1682 if (sm_supported(iommu))
1683 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1688 static void disable_dmar_iommu(struct intel_iommu *iommu)
1690 if (!iommu->domain_ids)
1694 * All iommu domains must have been detached from the devices,
1695 * hence there should be no domain IDs in use.
1697 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1698 > NUM_RESERVED_DID))
1701 if (iommu->gcmd & DMA_GCMD_TE)
1702 iommu_disable_translation(iommu);
1705 static void free_dmar_iommu(struct intel_iommu *iommu)
1707 if (iommu->domain_ids) {
1708 bitmap_free(iommu->domain_ids);
1709 iommu->domain_ids = NULL;
1712 if (iommu->copied_tables) {
1713 bitmap_free(iommu->copied_tables);
1714 iommu->copied_tables = NULL;
1717 /* free context mapping */
1718 free_context_table(iommu);
1720 #ifdef CONFIG_INTEL_IOMMU_SVM
1721 if (pasid_supported(iommu)) {
1722 if (ecap_prs(iommu->ecap))
1723 intel_svm_finish_prq(iommu);
1725 if (vccap_pasid(iommu->vccap))
1726 ioasid_unregister_allocator(&iommu->pasid_allocator);
1732 * Check and return whether first level is used by default for DMA translation.
1735 static bool first_level_by_default(unsigned int type)
1737 /* Only SL is available in legacy mode */
1738 if (!scalable_mode_support())
1741 /* Only level (either FL or SL) is available, just use it */
1742 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1743 return intel_cap_flts_sanity();
1745 /* Both levels are available, decide it based on domain type */
1746 return type != IOMMU_DOMAIN_UNMANAGED;
1749 static struct dmar_domain *alloc_domain(unsigned int type)
1751 struct dmar_domain *domain;
1753 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1757 domain->nid = NUMA_NO_NODE;
1758 if (first_level_by_default(type))
1759 domain->use_first_level = true;
1760 domain->has_iotlb_device = false;
1761 INIT_LIST_HEAD(&domain->devices);
1762 spin_lock_init(&domain->lock);
1763 xa_init(&domain->iommu_array);
1768 static int domain_attach_iommu(struct dmar_domain *domain,
1769 struct intel_iommu *iommu)
1771 struct iommu_domain_info *info, *curr;
1772 unsigned long ndomains;
1773 int num, ret = -ENOSPC;
1775 info = kzalloc(sizeof(*info), GFP_KERNEL);
1779 spin_lock(&iommu->lock);
1780 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1783 spin_unlock(&iommu->lock);
1788 ndomains = cap_ndoms(iommu->cap);
1789 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1790 if (num >= ndomains) {
1791 pr_err("%s: No free domain ids\n", iommu->name);
1795 set_bit(num, iommu->domain_ids);
1798 info->iommu = iommu;
1799 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1800 NULL, info, GFP_ATOMIC);
1802 ret = xa_err(curr) ? : -EBUSY;
1805 domain_update_iommu_cap(domain);
1807 spin_unlock(&iommu->lock);
1811 clear_bit(info->did, iommu->domain_ids);
1813 spin_unlock(&iommu->lock);
1818 static void domain_detach_iommu(struct dmar_domain *domain,
1819 struct intel_iommu *iommu)
1821 struct iommu_domain_info *info;
1823 spin_lock(&iommu->lock);
1824 info = xa_load(&domain->iommu_array, iommu->seq_id);
1825 if (--info->refcnt == 0) {
1826 clear_bit(info->did, iommu->domain_ids);
1827 xa_erase(&domain->iommu_array, iommu->seq_id);
1828 domain->nid = NUMA_NO_NODE;
1829 domain_update_iommu_cap(domain);
1832 spin_unlock(&iommu->lock);
1835 static inline int guestwidth_to_adjustwidth(int gaw)
1838 int r = (gaw - 12) % 9;
1849 static void domain_exit(struct dmar_domain *domain)
1852 LIST_HEAD(freelist);
1854 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1855 put_pages_list(&freelist);
1858 if (WARN_ON(!list_empty(&domain->devices)))
1865 * Get the PASID directory size for a scalable mode context entry.
1866 * The value X in the PDTS field of a scalable mode context entry
1867 * indicates a PASID directory with 2^(X + 7) entries.
1869 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1871 unsigned long pds, max_pde;
1873 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1874 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
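/*
 * Illustrative example (assumes PASID_PDE_SHIFT == 6, i.e. 64 PASID
 * table entries per directory entry): for a 20-bit PASID space,
 * max_pasid = 1 << 20, so max_pde = 1 << 14 and find_first_bit()
 * returns 14. A directory of 2^14 entries is encoded in the PDTS
 * field as X = 14 - 7 = 7, following the 2^(X + 7) rule above.
 */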
1882 * Set the RID_PASID field of a scalable mode context entry. The
1883 * IOMMU hardware will use the PASID value set in this field for
1884 * DMA translations of DMA requests without PASID.
1887 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1889 context->hi |= pasid & ((1 << 20) - 1);
1893 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1896 static inline void context_set_sm_dte(struct context_entry *context)
1898 context->lo |= (1 << 2);
1902 * Set the PRE(Page Request Enable) field of a scalable mode context
1905 static inline void context_set_sm_pre(struct context_entry *context)
1907 context->lo |= (1 << 4);
1910 /* Convert value to context PASID directory size field coding. */
1911 #define context_pdts(pds) (((pds) & 0x7) << 9)
1913 static int domain_context_mapping_one(struct dmar_domain *domain,
1914 struct intel_iommu *iommu,
1915 struct pasid_table *table,
1918 struct device_domain_info *info =
1919 domain_lookup_dev_info(domain, iommu, bus, devfn);
1920 u16 did = domain_id_iommu(domain, iommu);
1921 int translation = CONTEXT_TT_MULTI_LEVEL;
1922 struct context_entry *context;
1927 if (hw_pass_through && domain_type_is_si(domain))
1928 translation = CONTEXT_TT_PASS_THROUGH;
1930 pr_debug("Set context mapping for %02x:%02x.%d\n",
1931 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1933 BUG_ON(!domain->pgd);
1935 spin_lock(&iommu->lock);
1937 context = iommu_context_addr(iommu, bus, devfn, 1);
1942 if (context_present(context) && !context_copied(iommu, bus, devfn))
1946 * For kdump cases, old valid entries may be cached due to the
1947 * in-flight DMA and copied pgtable, but there is no unmapping
1948 * behaviour for them, thus we need an explicit cache flush for
1949 * the newly-mapped device. For kdump, at this point, the device
1950 * is supposed to have finished reset at its driver probe stage, so no
1951 * in-flight DMA will exist, and we don't need to worry about it anymore.
1954 if (context_copied(iommu, bus, devfn)) {
1955 u16 did_old = context_domain_id(context);
1957 if (did_old < cap_ndoms(iommu->cap)) {
1958 iommu->flush.flush_context(iommu, did_old,
1959 (((u16)bus) << 8) | devfn,
1960 DMA_CCMD_MASK_NOBIT,
1961 DMA_CCMD_DEVICE_INVL);
1962 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1966 clear_context_copied(iommu, bus, devfn);
1969 context_clear_entry(context);
1971 if (sm_supported(iommu)) {
1976 /* Setup the PASID DIR pointer: */
1977 pds = context_get_sm_pds(table);
1978 context->lo = (u64)virt_to_phys(table->table) |
1981 /* Setup the RID_PASID field: */
1982 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1985 * Setup the Device-TLB enable bit and Page request
1988 if (info && info->ats_supported)
1989 context_set_sm_dte(context);
1990 if (info && info->pri_supported)
1991 context_set_sm_pre(context);
1992 if (info && info->pasid_supported)
1993 context_set_pasid(context);
1995 struct dma_pte *pgd = domain->pgd;
1998 context_set_domain_id(context, did);
2000 if (translation != CONTEXT_TT_PASS_THROUGH) {
2002 * Skip top levels of page tables for iommu which has
2003 * less agaw than default. Unnecessary for PT mode.
2005 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2007 pgd = phys_to_virt(dma_pte_addr(pgd));
2008 if (!dma_pte_present(pgd))
2012 if (info && info->ats_supported)
2013 translation = CONTEXT_TT_DEV_IOTLB;
2015 translation = CONTEXT_TT_MULTI_LEVEL;
2017 context_set_address_root(context, virt_to_phys(pgd));
2018 context_set_address_width(context, agaw);
2021 * In pass through mode, AW must be programmed to
2022 * indicate the largest AGAW value supported by
2023 * hardware. And ASR is ignored by hardware.
2025 context_set_address_width(context, iommu->msagaw);
2028 context_set_translation_type(context, translation);
2031 context_set_fault_enable(context);
2032 context_set_present(context);
2033 if (!ecap_coherent(iommu->ecap))
2034 clflush_cache_range(context, sizeof(*context));
2037 * It's a non-present to present mapping. If the hardware doesn't cache
2038 * non-present entries we only need to flush the write-buffer. If it
2039 * _does_ cache non-present entries, then it does so in the special
2040 * domain #0, which we have to flush:
2042 if (cap_caching_mode(iommu->cap)) {
2043 iommu->flush.flush_context(iommu, 0,
2044 (((u16)bus) << 8) | devfn,
2045 DMA_CCMD_MASK_NOBIT,
2046 DMA_CCMD_DEVICE_INVL);
2047 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2049 iommu_flush_write_buffer(iommu);
2055 spin_unlock(&iommu->lock);
2060 struct domain_context_mapping_data {
2061 struct dmar_domain *domain;
2062 struct intel_iommu *iommu;
2063 struct pasid_table *table;
2066 static int domain_context_mapping_cb(struct pci_dev *pdev,
2067 u16 alias, void *opaque)
2069 struct domain_context_mapping_data *data = opaque;
2071 return domain_context_mapping_one(data->domain, data->iommu,
2072 data->table, PCI_BUS_NUM(alias),
2077 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2079 struct domain_context_mapping_data data;
2080 struct pasid_table *table;
2081 struct intel_iommu *iommu;
2084 iommu = device_to_iommu(dev, &bus, &devfn);
2088 table = intel_pasid_get_table(dev);
2090 if (!dev_is_pci(dev))
2091 return domain_context_mapping_one(domain, iommu, table,
2094 data.domain = domain;
2098 return pci_for_each_dma_alias(to_pci_dev(dev),
2099 &domain_context_mapping_cb, &data);
2102 /* Returns the number of VT-d pages, but aligned to the MM page size */
2103 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2106 host_addr &= ~PAGE_MASK;
2107 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
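/*
 * Illustrative example (4KiB pages): host_addr = 0xffe, size = 4 spans
 * a page boundary, so PAGE_ALIGN(0xffe + 4) = 0x2000 and two VT-d pages
 * are needed; host_addr = 0x1000, size = 0x1000 needs exactly one.
 */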
2110 /* Return largest possible superpage level for a given mapping */
2111 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2112 unsigned long iov_pfn,
2113 unsigned long phy_pfn,
2114 unsigned long pages)
2116 int support, level = 1;
2117 unsigned long pfnmerge;
2119 support = domain->iommu_superpage;
2121 /* To use a large page, the virtual *and* physical addresses
2122 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2123 of them will mean we have to use smaller pages. So just
2124 merge them and check both at once. */
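/*
 * Illustrative example: iov_pfn = 0x200 and phy_pfn = 0x400 are both
 * 2MiB aligned (low 9 bits clear), so pfnmerge = 0x600 is too; with at
 * least 512 pages to map and domain->iommu_superpage >= 1 the loop
 * below settles on level 2, i.e. a 2MiB superpage. iov_pfn = 0x201
 * would leave it at level 1 (4KiB pages).
 */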
2125 pfnmerge = iov_pfn | phy_pfn;
2127 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2128 pages >>= VTD_STRIDE_SHIFT;
2131 pfnmerge >>= VTD_STRIDE_SHIFT;
2139 * Ensure that old small page tables are removed to make room for superpage(s).
2140 * We're going to add new large pages, so make sure we don't remove their parent
2141 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2143 static void switch_to_super_page(struct dmar_domain *domain,
2144 unsigned long start_pfn,
2145 unsigned long end_pfn, int level)
2147 unsigned long lvl_pages = lvl_to_nr_pages(level);
2148 struct iommu_domain_info *info;
2149 struct dma_pte *pte = NULL;
2152 while (start_pfn <= end_pfn) {
2154 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2157 if (dma_pte_present(pte)) {
2158 dma_pte_free_pagetable(domain, start_pfn,
2159 start_pfn + lvl_pages - 1,
2162 xa_for_each(&domain->iommu_array, i, info)
2163 iommu_flush_iotlb_psi(info->iommu, domain,
2164 start_pfn, lvl_pages,
2169 start_pfn += lvl_pages;
2170 if (first_pte_in_page(pte))
2176 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2177 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2180 struct dma_pte *first_pte = NULL, *pte = NULL;
2181 unsigned int largepage_lvl = 0;
2182 unsigned long lvl_pages = 0;
2186 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2188 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2191 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2192 attr |= DMA_FL_PTE_PRESENT;
2193 if (domain->use_first_level) {
2194 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2195 if (prot & DMA_PTE_WRITE)
2196 attr |= DMA_FL_PTE_DIRTY;
2199 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2201 while (nr_pages > 0) {
2205 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2206 phys_pfn, nr_pages);
2208 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2214 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2216 /* It is a large page */
2217 if (largepage_lvl > 1) {
2218 unsigned long end_pfn;
2219 unsigned long pages_to_remove;
2221 pteval |= DMA_PTE_LARGE_PAGE;
2222 pages_to_remove = min_t(unsigned long, nr_pages,
2223 nr_pte_to_next_page(pte) * lvl_pages);
2224 end_pfn = iov_pfn + pages_to_remove - 1;
2225 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2227 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2231 /* We don't need lock here, nobody else
2232 * touches the iova range
2234 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2236 static int dumps = 5;
2237 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2238 iov_pfn, tmp, (unsigned long long)pteval);
2241 debug_dma_dump_mappings(NULL);
2246 nr_pages -= lvl_pages;
2247 iov_pfn += lvl_pages;
2248 phys_pfn += lvl_pages;
2249 pteval += lvl_pages * VTD_PAGE_SIZE;
2251 /* If the next PTE would be the first in a new page, then we
2252 * need to flush the cache on the entries we've just written.
2253 * And then we'll need to recalculate 'pte', so clear it and
2254 * let it get set again in the if (!pte) block above.
2256 * If we're done (!nr_pages) we need to flush the cache too.
2258 * Also if we've been setting superpages, we may need to
2259 * recalculate 'pte' and switch back to smaller pages for the
2260 * end of the mapping, if the trailing size is not enough to
2261 * use another superpage (i.e. nr_pages < lvl_pages).
2264 if (!nr_pages || first_pte_in_page(pte) ||
2265 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2266 domain_flush_cache(domain, first_pte,
2267 (void *)pte - (void *)first_pte);
2275 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2277 struct intel_iommu *iommu = info->iommu;
2278 struct context_entry *context;
2284 spin_lock(&iommu->lock);
2285 context = iommu_context_addr(iommu, bus, devfn, 0);
2287 spin_unlock(&iommu->lock);
2291 if (sm_supported(iommu)) {
2292 if (hw_pass_through && domain_type_is_si(info->domain))
2293 did_old = FLPT_DEFAULT_DID;
2295 did_old = domain_id_iommu(info->domain, iommu);
2297 did_old = context_domain_id(context);
2300 context_clear_entry(context);
2301 __iommu_flush_cache(iommu, context, sizeof(*context));
2302 spin_unlock(&iommu->lock);
2303 iommu->flush.flush_context(iommu,
2305 (((u16)bus) << 8) | devfn,
2306 DMA_CCMD_MASK_NOBIT,
2307 DMA_CCMD_DEVICE_INVL);
2309 if (sm_supported(iommu))
2310 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2312 iommu->flush.flush_iotlb(iommu,
2318 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2321 static int domain_setup_first_level(struct intel_iommu *iommu,
2322 struct dmar_domain *domain,
2326 struct dma_pte *pgd = domain->pgd;
2331 * Skip top levels of page tables for iommu which has
2332 * less agaw than default. Unnecessary for PT mode.
2334 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2335 pgd = phys_to_virt(dma_pte_addr(pgd));
2336 if (!dma_pte_present(pgd))
2340 level = agaw_to_level(agaw);
2341 if (level != 4 && level != 5)
2344 if (pasid != PASID_RID2PASID)
2345 flags |= PASID_FLAG_SUPERVISOR_MODE;
2347 flags |= PASID_FLAG_FL5LP;
2349 if (domain->force_snooping)
2350 flags |= PASID_FLAG_PAGE_SNOOP;
2352 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2353 domain_id_iommu(domain, iommu),
2357 static bool dev_is_real_dma_subdevice(struct device *dev)
2359 return dev && dev_is_pci(dev) &&
2360 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
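/*
 * Identity-map [first_vpfn, last_vpfn]: clear any stale PTEs in the range
 * and then map each IOVA PFN to the physical PFN of the same value.
 */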
2363 static int iommu_domain_identity_map(struct dmar_domain *domain,
2364 unsigned long first_vpfn,
2365 unsigned long last_vpfn)
/*
 * The RMRR range might overlap with a physical memory range,
 * so clear it first.
 */
2371 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2373 return __domain_mapping(domain, first_vpfn,
2374 first_vpfn, last_vpfn - first_vpfn + 1,
2375 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2378 static int md_domain_init(struct dmar_domain *domain, int guest_width);
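/*
 * Build the static identity (si) domain, identity-mapping all online
 * memory ranges as well as the RMRR regions.
 */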
2380 static int __init si_domain_init(int hw)
2382 struct dmar_rmrr_unit *rmrr;
2386 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2390 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2391 domain_exit(si_domain);
2399 for_each_online_node(nid) {
2400 unsigned long start_pfn, end_pfn;
2403 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2404 ret = iommu_domain_identity_map(si_domain,
2405 mm_to_dma_pfn(start_pfn),
2406 mm_to_dma_pfn(end_pfn));
/*
 * Identity map the RMRRs so that devices with RMRRs can also use
 * the si_domain.
 */
2416 for_each_rmrr_units(rmrr) {
2417 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2419 unsigned long long start = rmrr->base_address;
2420 unsigned long long end = rmrr->end_address;
2422 if (WARN_ON(end < start ||
2423 end >> agaw_to_width(si_domain->agaw)))
2426 ret = iommu_domain_identity_map(si_domain,
2427 mm_to_dma_pfn(start >> PAGE_SHIFT),
2428 mm_to_dma_pfn(end >> PAGE_SHIFT));
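/*
 * Attach @dev to @domain: link the device into the domain's device list
 * and set up the RID2PASID entry (scalable mode) or the context entry
 * (legacy mode) for DMA requests without PASID.
 */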
2437 static int dmar_domain_attach_device(struct dmar_domain *domain,
2440 struct device_domain_info *info = dev_iommu_priv_get(dev);
2441 struct intel_iommu *iommu;
2442 unsigned long flags;
2446 iommu = device_to_iommu(dev, &bus, &devfn);
2450 ret = domain_attach_iommu(domain, iommu);
2453 info->domain = domain;
2454 spin_lock_irqsave(&domain->lock, flags);
2455 list_add(&info->link, &domain->devices);
2456 spin_unlock_irqrestore(&domain->lock, flags);
2458 /* PASID table is mandatory for a PCI device in scalable mode. */
2459 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2460 /* Setup the PASID entry for requests without PASID: */
2461 if (hw_pass_through && domain_type_is_si(domain))
2462 ret = intel_pasid_setup_pass_through(iommu, domain,
2463 dev, PASID_RID2PASID);
2464 else if (domain->use_first_level)
2465 ret = domain_setup_first_level(iommu, domain, dev,
2468 ret = intel_pasid_setup_second_level(iommu, domain,
2469 dev, PASID_RID2PASID);
2471 dev_err(dev, "Setup RID2PASID failed\n");
2472 device_block_translation(dev);
2477 ret = domain_context_mapping(domain, dev);
2479 dev_err(dev, "Domain context map failed\n");
2480 device_block_translation(dev);
2484 iommu_enable_pci_caps(info);
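/* Return true if any RMRR unit's device scope contains @dev. */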
2489 static bool device_has_rmrr(struct device *dev)
2491 struct dmar_rmrr_unit *rmrr;
2496 for_each_rmrr_units(rmrr) {
/*
 * Return true if this RMRR contains the device that
 * we are looking for.
 */
2501 for_each_active_dev_scope(rmrr->devices,
2502 rmrr->devices_cnt, i, tmp)
2504 is_downstream_to_pci_bridge(dev, tmp)) {
/**
 * device_rmrr_is_relaxable - Test whether the RMRR of this device
 * is relaxable (i.e. is allowed to be not enforced under some conditions)
 * @dev: device handle
 *
 * We assume that PCI USB devices with RMRRs have them largely
 * for historical reasons and that the RMRR space is not actively used post
 * boot. This exclusion may change if vendors begin to abuse it.
 *
 * The same exception is made for graphics devices, with the requirement that
 * any use of the RMRR regions will be torn down before assigning the device
 * to a guest.
 *
 * Return: true if the RMRR is relaxable, false otherwise
 */
2528 static bool device_rmrr_is_relaxable(struct device *dev)
2530 struct pci_dev *pdev;
2532 if (!dev_is_pci(dev))
2535 pdev = to_pci_dev(dev);
2536 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
/*
 * There are a couple cases where we need to restrict the functionality of
 * devices associated with RMRRs. The first is when evaluating a device for
 * identity mapping because problems exist when devices are moved in and out
 * of domains and their respective RMRR information is lost. This means that
 * a device with associated RMRRs will never be in a "passthrough" domain.
 * The second is use of the device through the IOMMU API. This interface
 * expects to have full control of the IOVA space for the device. We cannot
 * satisfy both the requirement that RMRR access is maintained and have an
 * unencumbered IOVA space. We also have no ability to quiesce the device's
 * use of the RMRR space or even inform the IOMMU API user of the restriction.
 * We therefore prevent devices associated with an RMRR from participating in
 * the IOMMU API, which eliminates them from device assignment.
 *
 * In both cases, devices which have relaxable RMRRs are not concerned by this
 * restriction. See device_rmrr_is_relaxable comment.
 */
2559 static bool device_is_rmrr_locked(struct device *dev)
2561 if (!device_has_rmrr(dev))
2564 if (device_rmrr_is_relaxable(dev))
/*
 * Return the required default domain type for a specific device.
 *
 * @dev: the device in question
 *
 * Returns:
 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
 *  - 0: both identity and dynamic domains work for this device
 */
2581 static int device_def_domain_type(struct device *dev)
2583 if (dev_is_pci(dev)) {
2584 struct pci_dev *pdev = to_pci_dev(dev);
2586 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2587 return IOMMU_DOMAIN_IDENTITY;
2589 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2590 return IOMMU_DOMAIN_IDENTITY;
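/*
 * Select the invalidation backend for @iommu: prefer queued invalidation,
 * falling back to register-based invalidation if it cannot be enabled.
 */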
2596 static void intel_iommu_init_qi(struct intel_iommu *iommu)
/*
 * Start from a sane IOMMU hardware state.
 * If queued invalidation was already initialized by us (for
 * example, while enabling interrupt remapping), then things
 * are already rolling from a sane state.
 */
/*
 * Clear any previous faults.
 */
2608 dmar_fault(-1, iommu);
/*
 * Disable queued invalidation if supported and already enabled
 * before OS handover.
 */
2613 dmar_disable_qi(iommu);
2616 if (dmar_enable_qi(iommu)) {
/*
 * Queued invalidation is not enabled; use register-based
 * invalidation instead.
 */
2620 iommu->flush.flush_context = __iommu_flush_context;
2621 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2622 pr_info("%s: Using Register based invalidation\n",
2625 iommu->flush.flush_context = qi_flush_context;
2626 iommu->flush.flush_iotlb = qi_flush_iotlb;
2627 pr_info("%s: Using Queued invalidation\n", iommu->name);
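/*
 * Copy one bus worth of context entries from the previous kernel's tables
 * (kdump handover), marking the referenced domain IDs and copied entries
 * as in use.
 */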
2631 static int copy_context_table(struct intel_iommu *iommu,
2632 struct root_entry *old_re,
2633 struct context_entry **tbl,
2636 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2637 struct context_entry *new_ce = NULL, ce;
2638 struct context_entry *old_ce = NULL;
2639 struct root_entry re;
2640 phys_addr_t old_ce_phys;
2642 tbl_idx = ext ? bus * 2 : bus;
2643 memcpy(&re, old_re, sizeof(re));
2645 for (devfn = 0; devfn < 256; devfn++) {
2646 /* First calculate the correct index */
2647 idx = (ext ? devfn * 2 : devfn) % 256;
2650 /* First save what we may have and clean up */
2652 tbl[tbl_idx] = new_ce;
2653 __iommu_flush_cache(iommu, new_ce,
2663 old_ce_phys = root_entry_lctp(&re);
2665 old_ce_phys = root_entry_uctp(&re);
2668 if (ext && devfn == 0) {
2669 /* No LCTP, try UCTP */
2678 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2683 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2690 /* Now copy the context entry */
2691 memcpy(&ce, old_ce + idx, sizeof(ce));
2693 if (!context_present(&ce))
2696 did = context_domain_id(&ce);
2697 if (did >= 0 && did < cap_ndoms(iommu->cap))
2698 set_bit(did, iommu->domain_ids);
2700 set_context_copied(iommu, bus, devfn);
2704 tbl[tbl_idx + pos] = new_ce;
2706 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
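/*
 * Copy the root and context tables programmed by the previous kernel so
 * that in-flight DMA keeps working until new tables are installed.
 */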
2715 static int copy_translation_tables(struct intel_iommu *iommu)
2717 struct context_entry **ctxt_tbls;
2718 struct root_entry *old_rt;
2719 phys_addr_t old_rt_phys;
2720 int ctxt_table_entries;
2725 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2726 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2727 new_ext = !!sm_supported(iommu);
/*
 * The RTT bit can only be changed when translation is disabled,
 * but disabling translation would open a window for data
 * corruption. So bail out and don't copy anything if we would
 * have to change the bit.
 */
2738 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2739 if (!iommu->copied_tables)
2742 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2746 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2750 /* This is too big for the stack - allocate it from slab */
2751 ctxt_table_entries = ext ? 512 : 256;
2753 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2757 for (bus = 0; bus < 256; bus++) {
2758 ret = copy_context_table(iommu, &old_rt[bus],
2759 ctxt_tbls, bus, ext);
2761 pr_err("%s: Failed to copy context table for bus %d\n",
2767 spin_lock(&iommu->lock);
2769 /* Context tables are copied, now write them to the root_entry table */
2770 for (bus = 0; bus < 256; bus++) {
2771 int idx = ext ? bus * 2 : bus;
2774 if (ctxt_tbls[idx]) {
2775 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2776 iommu->root_entry[bus].lo = val;
2779 if (!ext || !ctxt_tbls[idx + 1])
2782 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2783 iommu->root_entry[bus].hi = val;
2786 spin_unlock(&iommu->lock);
2790 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2800 #ifdef CONFIG_INTEL_IOMMU_SVM
2801 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2803 struct intel_iommu *iommu = data;
2807 return INVALID_IOASID;
/*
 * The VT-d virtual command interface always uses the full 20-bit
 * PASID range. The host can partition the guest PASID range based
 * on policies, but that is out of the guest's control.
 */
2813 if (min < PASID_MIN || max > intel_pasid_max_id)
2814 return INVALID_IOASID;
2816 if (vcmd_alloc_pasid(iommu, &ioasid))
2817 return INVALID_IOASID;
2822 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2824 struct intel_iommu *iommu = data;
/*
 * The sanity check of the IOASID owner is done at the upper layer,
 * e.g. VFIO. We can only free the PASID when all the devices are
 * unbound.
 */
2832 if (ioasid_find(NULL, ioasid, NULL)) {
2833 pr_alert("Cannot free active IOASID %d\n", ioasid);
2836 vcmd_free_pasid(iommu, ioasid);
2839 static void register_pasid_allocator(struct intel_iommu *iommu)
/*
 * If we are running in the host, there is no need for a custom
 * allocator because PASIDs are allocated from the host system-wide.
 */
2845 if (!cap_caching_mode(iommu->cap))
2848 if (!sm_supported(iommu)) {
2849 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
/*
 * Register a custom PASID allocator if we are running in a guest;
 * guest PASIDs must be obtained via the virtual command interface.
 * There can be multiple vIOMMUs in each guest but only one allocator
 * is active. All vIOMMU allocators will eventually call the same
 * host allocator.
 */
2860 if (!vccap_pasid(iommu->vccap))
2863 pr_info("Register custom PASID allocator\n");
2864 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2865 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2866 iommu->pasid_allocator.pdata = (void *)iommu;
2867 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2868 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
/*
 * Disable scalable mode on this IOMMU if there is no custom
 * allocator. Mixing SM-capable vIOMMUs and non-SM vIOMMUs is
 * not supported.
 */
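/*
 * One-time DMAR bring-up: initialize invalidation, domain IDs and root
 * entries on every IOMMU, build the si_domain when needed, and finally
 * enable translation.
 */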
2879 static int __init init_dmars(void)
2881 struct dmar_drhd_unit *drhd;
2882 struct intel_iommu *iommu;
2885 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2889 for_each_iommu(iommu, drhd) {
2890 if (drhd->ignored) {
2891 iommu_disable_translation(iommu);
/*
 * Find the max PASID size of all IOMMUs in the system.
 * We need to ensure the system PASID table is no bigger
 * than the smallest supported size.
 */
2900 if (pasid_supported(iommu)) {
2901 u32 temp = 2 << ecap_pss(iommu->ecap);
2903 intel_pasid_max_id = min_t(u32, temp,
2904 intel_pasid_max_id);
2907 intel_iommu_init_qi(iommu);
2909 ret = iommu_init_domains(iommu);
2913 init_translation_status(iommu);
2915 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2916 iommu_disable_translation(iommu);
2917 clear_translation_pre_enabled(iommu);
2918 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
/*
 * TBD: we could share the same root & context tables
 * among all IOMMUs. Need to split it later.
 */
2927 ret = iommu_alloc_root_entry(iommu);
2931 if (translation_pre_enabled(iommu)) {
2932 pr_info("Translation already enabled - trying to copy translation structures\n");
2934 ret = copy_translation_tables(iommu);
/*
 * We found the IOMMU with translation
 * enabled - but failed to copy over the
 * old root-entry table. Try to proceed
 * by disabling translation now and
 * allocating a clean root-entry table.
 * This might cause DMAR faults, but
 * probably the dump will still succeed.
 */
2945 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2947 iommu_disable_translation(iommu);
2948 clear_translation_pre_enabled(iommu);
2950 pr_info("Copied translation tables from previous kernel for %s\n",
2955 if (!ecap_pass_through(iommu->ecap))
2956 hw_pass_through = 0;
2957 intel_svm_check(iommu);
/*
 * Now that qi is enabled on all iommus, set the root entry and flush
 * caches. This is required on some Intel X58 chipsets, otherwise the
 * flush_context function will loop forever and the boot hangs.
 */
2965 for_each_active_iommu(iommu, drhd) {
2966 iommu_flush_write_buffer(iommu);
2967 #ifdef CONFIG_INTEL_IOMMU_SVM
2968 register_pasid_allocator(iommu);
2970 iommu_set_root_entry(iommu);
2973 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2978 iommu_identity_mapping |= IDENTMAP_GFX;
2980 check_tylersburg_isoch();
2982 ret = si_domain_init(hw_pass_through);
/*
 * For each DRHD unit:
 *   global invalidate context cache
 *   global invalidate iotlb
 *   enable translation
 */
2993 for_each_iommu(iommu, drhd) {
2994 if (drhd->ignored) {
/*
 * We always have to disable PMRs or DMA may fail on this device.
 */
iommu_disable_protect_mem_regions(iommu);
3004 iommu_flush_write_buffer(iommu);
3006 #ifdef CONFIG_INTEL_IOMMU_SVM
3007 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
/*
 * Calling dmar_alloc_hwirq() with dmar_global_lock held could
 * cause a lock race condition, so drop the lock around
 * intel_svm_enable_prq().
 */
3012 up_write(&dmar_global_lock);
3013 ret = intel_svm_enable_prq(iommu);
3014 down_write(&dmar_global_lock);
3019 ret = dmar_set_interrupt(iommu);
3027 for_each_active_iommu(iommu, drhd) {
3028 disable_dmar_iommu(iommu);
3029 free_dmar_iommu(iommu);
3032 domain_exit(si_domain);
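/*
 * Mark DMAR units that have no usable devices so they can be ignored, and
 * flag units that cover only graphics devices as gfx-dedicated.
 */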
3039 static void __init init_no_remapping_devices(void)
3041 struct dmar_drhd_unit *drhd;
3045 for_each_drhd_unit(drhd) {
3046 if (!drhd->include_all) {
3047 for_each_active_dev_scope(drhd->devices,
3048 drhd->devices_cnt, i, dev)
3050 /* ignore DMAR unit if no devices exist */
3051 if (i == drhd->devices_cnt)
3056 for_each_active_drhd_unit(drhd) {
3057 if (drhd->include_all)
3060 for_each_active_dev_scope(drhd->devices,
3061 drhd->devices_cnt, i, dev)
3062 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3064 if (i < drhd->devices_cnt)
/*
 * This IOMMU has *only* gfx devices. Either bypass it or
 * set the gfx_dedicated flag, as appropriate.
 */
3069 drhd->gfx_dedicated = 1;
3075 #ifdef CONFIG_SUSPEND
3076 static int init_iommu_hw(void)
3078 struct dmar_drhd_unit *drhd;
3079 struct intel_iommu *iommu = NULL;
3081 for_each_active_iommu(iommu, drhd)
3083 dmar_reenable_qi(iommu);
3085 for_each_iommu(iommu, drhd) {
3086 if (drhd->ignored) {
/*
 * We always have to disable PMRs or DMA may fail on this device.
 */
iommu_disable_protect_mem_regions(iommu);
3096 iommu_flush_write_buffer(iommu);
3097 iommu_set_root_entry(iommu);
3098 iommu_enable_translation(iommu);
3099 iommu_disable_protect_mem_regions(iommu);
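/* Globally invalidate the context caches and IOTLBs of all active IOMMUs. */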
3105 static void iommu_flush_all(void)
3107 struct dmar_drhd_unit *drhd;
3108 struct intel_iommu *iommu;
3110 for_each_active_iommu(iommu, drhd) {
3111 iommu->flush.flush_context(iommu, 0, 0, 0,
3112 DMA_CCMD_GLOBAL_INVL);
3113 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3114 DMA_TLB_GLOBAL_FLUSH);
3118 static int iommu_suspend(void)
3120 struct dmar_drhd_unit *drhd;
3121 struct intel_iommu *iommu = NULL;
3124 for_each_active_iommu(iommu, drhd) {
3125 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3127 if (!iommu->iommu_state)
3133 for_each_active_iommu(iommu, drhd) {
3134 iommu_disable_translation(iommu);
3136 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3138 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3139 readl(iommu->reg + DMAR_FECTL_REG);
3140 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3141 readl(iommu->reg + DMAR_FEDATA_REG);
3142 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3143 readl(iommu->reg + DMAR_FEADDR_REG);
3144 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3145 readl(iommu->reg + DMAR_FEUADDR_REG);
3147 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3152 for_each_active_iommu(iommu, drhd)
3153 kfree(iommu->iommu_state);
3158 static void iommu_resume(void)
3160 struct dmar_drhd_unit *drhd;
3161 struct intel_iommu *iommu = NULL;
3164 if (init_iommu_hw()) {
3166 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3168 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3172 for_each_active_iommu(iommu, drhd) {
3174 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3176 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3177 iommu->reg + DMAR_FECTL_REG);
3178 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3179 iommu->reg + DMAR_FEDATA_REG);
3180 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3181 iommu->reg + DMAR_FEADDR_REG);
3182 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3183 iommu->reg + DMAR_FEUADDR_REG);
3185 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3188 for_each_active_iommu(iommu, drhd)
3189 kfree(iommu->iommu_state);
3192 static struct syscore_ops iommu_syscore_ops = {
3193 .resume = iommu_resume,
3194 .suspend = iommu_suspend,
3197 static void __init init_iommu_pm_ops(void)
3199 register_syscore_ops(&iommu_syscore_ops);
3203 static inline void init_iommu_pm_ops(void) {}
3204 #endif /* CONFIG_PM */
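/*
 * An RMRR must be page aligned and describe a non-empty range that also
 * passes the architecture-specific sanity check.
 */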
3206 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3208 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3209 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3210 rmrr->end_address <= rmrr->base_address ||
3211 arch_rmrr_sanity_check(rmrr))
3217 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3219 struct acpi_dmar_reserved_memory *rmrr;
3220 struct dmar_rmrr_unit *rmrru;
3222 rmrr = (struct acpi_dmar_reserved_memory *)header;
3223 if (rmrr_sanity_check(rmrr)) {
3225 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3226 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3227 rmrr->base_address, rmrr->end_address,
3228 dmi_get_system_info(DMI_BIOS_VENDOR),
3229 dmi_get_system_info(DMI_BIOS_VERSION),
3230 dmi_get_system_info(DMI_PRODUCT_VERSION));
3231 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3234 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3238 rmrru->hdr = header;
3240 rmrru->base_address = rmrr->base_address;
3241 rmrru->end_address = rmrr->end_address;
3243 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3244 ((void *)rmrr) + rmrr->header.length,
3245 &rmrru->devices_cnt);
3246 if (rmrru->devices_cnt && rmrru->devices == NULL)
3249 list_add(&rmrru->list, &dmar_rmrr_units);
3258 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3260 struct dmar_atsr_unit *atsru;
3261 struct acpi_dmar_atsr *tmp;
3263 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3265 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3266 if (atsr->segment != tmp->segment)
3268 if (atsr->header.length != tmp->header.length)
3270 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3277 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3279 struct acpi_dmar_atsr *atsr;
3280 struct dmar_atsr_unit *atsru;
3282 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3285 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3286 atsru = dmar_find_atsr(atsr);
3290 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
/*
 * If memory is allocated from slab by the ACPI _DSM method, we need to
 * copy the memory content because the memory buffer will be freed
 * on release.
 */
3299 atsru->hdr = (void *)(atsru + 1);
3300 memcpy(atsru->hdr, hdr, hdr->length);
3301 atsru->include_all = atsr->flags & 0x1;
3302 if (!atsru->include_all) {
3303 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3304 (void *)atsr + atsr->header.length,
3305 &atsru->devices_cnt);
3306 if (atsru->devices_cnt && atsru->devices == NULL) {
3312 list_add_rcu(&atsru->list, &dmar_atsr_units);
3317 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3319 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3323 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3325 struct acpi_dmar_atsr *atsr;
3326 struct dmar_atsr_unit *atsru;
3328 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3329 atsru = dmar_find_atsr(atsr);
3331 list_del_rcu(&atsru->list);
3333 intel_iommu_free_atsr(atsru);
3339 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3343 struct acpi_dmar_atsr *atsr;
3344 struct dmar_atsr_unit *atsru;
3346 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3347 atsru = dmar_find_atsr(atsr);
3351 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3352 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3360 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3362 struct dmar_satc_unit *satcu;
3363 struct acpi_dmar_satc *tmp;
3365 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3367 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3368 if (satc->segment != tmp->segment)
3370 if (satc->header.length != tmp->header.length)
3372 if (memcmp(satc, tmp, satc->header.length) == 0)
3379 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3381 struct acpi_dmar_satc *satc;
3382 struct dmar_satc_unit *satcu;
3384 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3387 satc = container_of(hdr, struct acpi_dmar_satc, header);
3388 satcu = dmar_find_satc(satc);
3392 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3396 satcu->hdr = (void *)(satcu + 1);
3397 memcpy(satcu->hdr, hdr, hdr->length);
3398 satcu->atc_required = satc->flags & 0x1;
3399 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3400 (void *)satc + satc->header.length,
3401 &satcu->devices_cnt);
3402 if (satcu->devices_cnt && !satcu->devices) {
3406 list_add_rcu(&satcu->list, &dmar_satc_units);
3411 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3414 struct intel_iommu *iommu = dmaru->iommu;
3416 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3420 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3421 pr_warn("%s: Doesn't support hardware pass through.\n",
3426 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3427 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3428 pr_warn("%s: Doesn't support large page.\n",
/*
 * Disable translation if already enabled prior to OS handover.
 */
3436 if (iommu->gcmd & DMA_GCMD_TE)
3437 iommu_disable_translation(iommu);
3439 ret = iommu_init_domains(iommu);
3441 ret = iommu_alloc_root_entry(iommu);
3445 intel_svm_check(iommu);
3447 if (dmaru->ignored) {
/*
 * We always have to disable PMRs or DMA may fail on this device.
 */
3452 iommu_disable_protect_mem_regions(iommu);
3456 intel_iommu_init_qi(iommu);
3457 iommu_flush_write_buffer(iommu);
3459 #ifdef CONFIG_INTEL_IOMMU_SVM
3460 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3461 ret = intel_svm_enable_prq(iommu);
3466 ret = dmar_set_interrupt(iommu);
3470 iommu_set_root_entry(iommu);
3471 iommu_enable_translation(iommu);
3473 iommu_disable_protect_mem_regions(iommu);
3477 disable_dmar_iommu(iommu);
3479 free_dmar_iommu(iommu);
3483 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3486 struct intel_iommu *iommu = dmaru->iommu;
3488 if (!intel_iommu_enabled)
3494 ret = intel_iommu_add(dmaru);
3496 disable_dmar_iommu(iommu);
3497 free_dmar_iommu(iommu);
3503 static void intel_iommu_free_dmars(void)
3505 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3506 struct dmar_atsr_unit *atsru, *atsr_n;
3507 struct dmar_satc_unit *satcu, *satc_n;
3509 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3510 list_del(&rmrru->list);
3511 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3515 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3516 list_del(&atsru->list);
3517 intel_iommu_free_atsr(atsru);
3519 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3520 list_del(&satcu->list);
3521 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
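/* Find the SATC unit whose device scope contains the physical function of @dev. */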
3526 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3528 struct dmar_satc_unit *satcu;
3529 struct acpi_dmar_satc *satc;
3533 dev = pci_physfn(dev);
3536 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3537 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3538 if (satc->segment != pci_domain_nr(dev->bus))
3540 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3541 if (to_pci_dev(tmp) == dev)
3550 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3553 struct pci_bus *bus;
3554 struct pci_dev *bridge = NULL;
3556 struct acpi_dmar_atsr *atsr;
3557 struct dmar_atsr_unit *atsru;
3558 struct dmar_satc_unit *satcu;
3560 dev = pci_physfn(dev);
3561 satcu = dmar_find_matched_satc_unit(dev);
/*
 * This device supports ATS because it is listed in the SATC table.
 * When the IOMMU is in legacy mode, ATS is enabled automatically by
 * the hardware for devices that require it, so the OS should not
 * enable ATS on this device, to avoid duplicated TLB invalidations.
 */
3570 return !(satcu->atc_required && !sm_supported(iommu));
3572 for (bus = dev->bus; bus; bus = bus->parent) {
3574 /* If it's an integrated device, allow ATS */
3577 /* Connected via non-PCIe: no ATS */
3578 if (!pci_is_pcie(bridge) ||
3579 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3581 /* If we found the root port, look it up in the ATSR */
3582 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3587 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3588 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3589 if (atsr->segment != pci_domain_nr(dev->bus))
3592 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3593 if (tmp == &bridge->dev)
3596 if (atsru->include_all)
3606 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3609 struct dmar_rmrr_unit *rmrru;
3610 struct dmar_atsr_unit *atsru;
3611 struct dmar_satc_unit *satcu;
3612 struct acpi_dmar_atsr *atsr;
3613 struct acpi_dmar_reserved_memory *rmrr;
3614 struct acpi_dmar_satc *satc;
3616 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3619 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3620 rmrr = container_of(rmrru->hdr,
3621 struct acpi_dmar_reserved_memory, header);
3622 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3623 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3624 ((void *)rmrr) + rmrr->header.length,
3625 rmrr->segment, rmrru->devices,
3626 rmrru->devices_cnt);
3629 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3630 dmar_remove_dev_scope(info, rmrr->segment,
3631 rmrru->devices, rmrru->devices_cnt);
3635 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3636 if (atsru->include_all)
3639 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3640 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3641 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3642 (void *)atsr + atsr->header.length,
3643 atsr->segment, atsru->devices,
3644 atsru->devices_cnt);
3649 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3650 if (dmar_remove_dev_scope(info, atsr->segment,
3651 atsru->devices, atsru->devices_cnt))
3655 list_for_each_entry(satcu, &dmar_satc_units, list) {
3656 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3657 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3658 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3659 (void *)satc + satc->header.length,
3660 satc->segment, satcu->devices,
3661 satcu->devices_cnt);
3666 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3667 if (dmar_remove_dev_scope(info, satc->segment,
3668 satcu->devices, satcu->devices_cnt))
3676 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3677 unsigned long val, void *v)
3679 struct memory_notify *mhp = v;
3680 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3681 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3685 case MEM_GOING_ONLINE:
3686 if (iommu_domain_identity_map(si_domain,
3687 start_vpfn, last_vpfn)) {
3688 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3689 start_vpfn, last_vpfn);
3695 case MEM_CANCEL_ONLINE:
3697 struct dmar_drhd_unit *drhd;
3698 struct intel_iommu *iommu;
3699 LIST_HEAD(freelist);
3701 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3704 for_each_active_iommu(iommu, drhd)
3705 iommu_flush_iotlb_psi(iommu, si_domain,
3706 start_vpfn, mhp->nr_pages,
3707 list_empty(&freelist), 0);
3709 put_pages_list(&freelist);
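/* Keep the si_domain's identity map in sync with memory hotplug events. */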
3717 static struct notifier_block intel_iommu_memory_nb = {
3718 .notifier_call = intel_iommu_memory_notifier,
3722 static void intel_disable_iommus(void)
3724 struct intel_iommu *iommu = NULL;
3725 struct dmar_drhd_unit *drhd;
3727 for_each_iommu(iommu, drhd)
3728 iommu_disable_translation(iommu);
3731 void intel_iommu_shutdown(void)
3733 struct dmar_drhd_unit *drhd;
3734 struct intel_iommu *iommu = NULL;
3736 if (no_iommu || dmar_disabled)
3739 down_write(&dmar_global_lock);
3741 /* Disable PMRs explicitly here. */
3742 for_each_iommu(iommu, drhd)
3743 iommu_disable_protect_mem_regions(iommu);
3745 /* Make sure the IOMMUs are switched off */
3746 intel_disable_iommus();
3748 up_write(&dmar_global_lock);
3751 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3753 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3755 return container_of(iommu_dev, struct intel_iommu, iommu);
3758 static ssize_t version_show(struct device *dev,
3759 struct device_attribute *attr, char *buf)
3761 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3762 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3763 return sprintf(buf, "%d:%d\n",
3764 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3766 static DEVICE_ATTR_RO(version);
3768 static ssize_t address_show(struct device *dev,
3769 struct device_attribute *attr, char *buf)
3771 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3772 return sprintf(buf, "%llx\n", iommu->reg_phys);
3774 static DEVICE_ATTR_RO(address);
3776 static ssize_t cap_show(struct device *dev,
3777 struct device_attribute *attr, char *buf)
3779 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3780 return sprintf(buf, "%llx\n", iommu->cap);
3782 static DEVICE_ATTR_RO(cap);
3784 static ssize_t ecap_show(struct device *dev,
3785 struct device_attribute *attr, char *buf)
3787 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3788 return sprintf(buf, "%llx\n", iommu->ecap);
3790 static DEVICE_ATTR_RO(ecap);
3792 static ssize_t domains_supported_show(struct device *dev,
3793 struct device_attribute *attr, char *buf)
3795 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3796 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3798 static DEVICE_ATTR_RO(domains_supported);
3800 static ssize_t domains_used_show(struct device *dev,
3801 struct device_attribute *attr, char *buf)
3803 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3804 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3805 cap_ndoms(iommu->cap)));
3807 static DEVICE_ATTR_RO(domains_used);
3809 static struct attribute *intel_iommu_attrs[] = {
3810 &dev_attr_version.attr,
3811 &dev_attr_address.attr,
3813 &dev_attr_ecap.attr,
3814 &dev_attr_domains_supported.attr,
3815 &dev_attr_domains_used.attr,
3819 static struct attribute_group intel_iommu_group = {
3820 .name = "intel-iommu",
3821 .attrs = intel_iommu_attrs,
3824 const struct attribute_group *intel_iommu_groups[] = {
3829 static inline bool has_external_pci(void)
3831 struct pci_dev *pdev = NULL;
3833 for_each_pci_dev(pdev)
3834 if (pdev->external_facing) {
3842 static int __init platform_optin_force_iommu(void)
3844 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3847 if (no_iommu || dmar_disabled)
3848 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3851 * If Intel-IOMMU is disabled by default, we will apply identity
3852 * map for all devices except those marked as being untrusted.
3855 iommu_set_default_passthrough(false);
3863 static int __init probe_acpi_namespace_devices(void)
3865 struct dmar_drhd_unit *drhd;
3866 /* To avoid a -Wunused-but-set-variable warning. */
3867 struct intel_iommu *iommu __maybe_unused;
3871 for_each_active_iommu(iommu, drhd) {
3872 for_each_active_dev_scope(drhd->devices,
3873 drhd->devices_cnt, i, dev) {
3874 struct acpi_device_physical_node *pn;
3875 struct iommu_group *group;
3876 struct acpi_device *adev;
3878 if (dev->bus != &acpi_bus_type)
3881 adev = to_acpi_device(dev);
3882 mutex_lock(&adev->physical_node_lock);
3883 list_for_each_entry(pn,
3884 &adev->physical_node_list, node) {
3885 group = iommu_group_get(pn->dev);
3887 iommu_group_put(group);
3891 ret = iommu_probe_device(pn->dev);
3895 mutex_unlock(&adev->physical_node_lock);
3905 static __init int tboot_force_iommu(void)
3907 if (!tboot_enabled())
3910 if (no_iommu || dmar_disabled)
3911 pr_warn("Forcing Intel-IOMMU to enabled\n");
3919 int __init intel_iommu_init(void)
3922 struct dmar_drhd_unit *drhd;
3923 struct intel_iommu *iommu;
/*
 * Intel IOMMU is required for a TXT/tboot launch or platform
 * opt in, so enforce that.
 */
3929 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3930 platform_optin_force_iommu();
3932 down_write(&dmar_global_lock);
3933 if (dmar_table_init()) {
3935 panic("tboot: Failed to initialize DMAR table\n");
3939 if (dmar_dev_scope_init() < 0) {
3941 panic("tboot: Failed to initialize DMAR device scope\n");
3945 up_write(&dmar_global_lock);
/*
 * The bus notifier takes the dmar_global_lock, so lockdep will
 * complain later when we register it under the lock.
 */
3951 dmar_register_bus_notifier();
3953 down_write(&dmar_global_lock);
3956 intel_iommu_debugfs_init();
3958 if (no_iommu || dmar_disabled) {
/*
 * We exit the function here to ensure IOMMU's remapping and
 * mempool aren't setup, which means that the IOMMU's PMRs
 * won't be disabled via the call to init_dmars(). So disable
 * it explicitly here. The PMRs were setup by tboot prior to
 * calling SENTER, but the kernel is expected to reset/tear
 * down the PMRs.
 */
3967 if (intel_iommu_tboot_noforce) {
3968 for_each_iommu(iommu, drhd)
3969 iommu_disable_protect_mem_regions(iommu);
/*
 * Make sure the IOMMUs are switched off, even when we
 * boot into a kexec kernel and the previous kernel left
 * them enabled.
 */
3977 intel_disable_iommus();
3981 if (list_empty(&dmar_rmrr_units))
3982 pr_info("No RMRR found\n");
3984 if (list_empty(&dmar_atsr_units))
3985 pr_info("No ATSR found\n");
3987 if (list_empty(&dmar_satc_units))
3988 pr_info("No SATC found\n");
3990 init_no_remapping_devices();
3995 panic("tboot: Failed to initialize DMARs\n");
3996 pr_err("Initialization failed\n");
3999 up_write(&dmar_global_lock);
4001 init_iommu_pm_ops();
4003 down_read(&dmar_global_lock);
4004 for_each_active_iommu(iommu, drhd) {
/*
 * The flush queue implementation does not perform
 * page-selective invalidations that are required for efficient
 * TLB flushes in virtual environments. The benefit of batching
 * is likely to be much lower than the overhead of synchronizing
 * the virtual and physical IOMMU page-tables.
 */
4012 if (cap_caching_mode(iommu->cap) &&
4013 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
4014 pr_info_once("IOMMU batching disallowed due to virtualization\n");
4015 iommu_set_dma_strict();
4017 iommu_device_sysfs_add(&iommu->iommu, NULL,
4020 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4022 iommu_pmu_register(iommu);
4024 up_read(&dmar_global_lock);
4026 if (si_domain && !hw_pass_through)
4027 register_memory_notifier(&intel_iommu_memory_nb);
4029 down_read(&dmar_global_lock);
4030 if (probe_acpi_namespace_devices())
4031 pr_warn("ACPI name space devices didn't probe correctly\n");
4033 /* Finally, we enable the DMA remapping hardware. */
4034 for_each_iommu(iommu, drhd) {
4035 if (!drhd->ignored && !translation_pre_enabled(iommu))
4036 iommu_enable_translation(iommu);
4038 iommu_disable_protect_mem_regions(iommu);
4040 up_read(&dmar_global_lock);
4042 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4044 intel_iommu_enabled = 1;
4049 intel_iommu_free_dmars();
4050 up_write(&dmar_global_lock);
4054 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4056 struct device_domain_info *info = opaque;
4058 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices. If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
4068 static void domain_context_clear(struct device_domain_info *info)
4070 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4073 pci_for_each_dma_alias(to_pci_dev(info->dev),
4074 &domain_context_clear_one_cb, info);
4077 static void dmar_remove_one_dev_info(struct device *dev)
4079 struct device_domain_info *info = dev_iommu_priv_get(dev);
4080 struct dmar_domain *domain = info->domain;
4081 struct intel_iommu *iommu = info->iommu;
4082 unsigned long flags;
4084 if (!dev_is_real_dma_subdevice(info->dev)) {
4085 if (dev_is_pci(info->dev) && sm_supported(iommu))
4086 intel_pasid_tear_down_entry(iommu, info->dev,
4087 PASID_RID2PASID, false);
4089 iommu_disable_pci_caps(info);
4090 domain_context_clear(info);
4093 spin_lock_irqsave(&domain->lock, flags);
4094 list_del(&info->link);
4095 spin_unlock_irqrestore(&domain->lock, flags);
4097 domain_detach_iommu(domain, iommu);
4098 info->domain = NULL;
/*
 * Clear the page table pointer in context or pasid table entries so that
 * all DMA requests without PASID from the device are blocked. If the page
 * table has been set, clean up the data structures.
 */
4106 static void device_block_translation(struct device *dev)
4108 struct device_domain_info *info = dev_iommu_priv_get(dev);
4109 struct intel_iommu *iommu = info->iommu;
4110 unsigned long flags;
4112 iommu_disable_pci_caps(info);
4113 if (!dev_is_real_dma_subdevice(dev)) {
4114 if (sm_supported(iommu))
4115 intel_pasid_tear_down_entry(iommu, dev,
4116 PASID_RID2PASID, false);
4118 domain_context_clear(info);
4124 spin_lock_irqsave(&info->domain->lock, flags);
4125 list_del(&info->link);
4126 spin_unlock_irqrestore(&info->domain->lock, flags);
4128 domain_detach_iommu(info->domain, iommu);
4129 info->domain = NULL;
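/*
 * Initialize a domain allocated through the IOMMU API: compute the
 * adjusted address width and allocate the top-level page directory.
 */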
4132 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4136 /* calculate AGAW */
4137 domain->gaw = guest_width;
4138 adjust_width = guestwidth_to_adjustwidth(guest_width);
4139 domain->agaw = width_to_agaw(adjust_width);
4141 domain->iommu_coherency = false;
4142 domain->iommu_superpage = 0;
4143 domain->max_addr = 0;
4145 /* always allocate the top pgd */
4146 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4149 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4153 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4156 device_block_translation(dev);
4160 static struct iommu_domain blocking_domain = {
4161 .ops = &(const struct iommu_domain_ops) {
4162 .attach_dev = blocking_domain_attach_dev,
4163 .free = intel_iommu_domain_free
4167 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4169 struct dmar_domain *dmar_domain;
4170 struct iommu_domain *domain;
4173 case IOMMU_DOMAIN_BLOCKED:
4174 return &blocking_domain;
4175 case IOMMU_DOMAIN_DMA:
4176 case IOMMU_DOMAIN_DMA_FQ:
4177 case IOMMU_DOMAIN_UNMANAGED:
4178 dmar_domain = alloc_domain(type);
4180 pr_err("Can't allocate dmar_domain\n");
4183 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4184 pr_err("Domain initialization failed\n");
4185 domain_exit(dmar_domain);
4189 domain = &dmar_domain->domain;
4190 domain->geometry.aperture_start = 0;
4191 domain->geometry.aperture_end =
4192 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4193 domain->geometry.force_aperture = true;
4196 case IOMMU_DOMAIN_IDENTITY:
4197 return &si_domain->domain;
4198 case IOMMU_DOMAIN_SVA:
4199 return intel_svm_domain_alloc();
4207 static void intel_iommu_domain_free(struct iommu_domain *domain)
4209 if (domain != &si_domain->domain && domain != &blocking_domain)
4210 domain_exit(to_dmar_domain(domain));
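/*
 * Check that @dev's IOMMU can back @domain and trim the domain's
 * page-table levels to what the IOMMU's agaw supports.
 */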
4213 static int prepare_domain_attach_device(struct iommu_domain *domain,
4216 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4217 struct intel_iommu *iommu;
4220 iommu = device_to_iommu(dev, NULL, NULL);
4224 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4227 /* check if this iommu agaw is sufficient for max mapped address */
4228 addr_width = agaw_to_width(iommu->agaw);
4229 if (addr_width > cap_mgaw(iommu->cap))
4230 addr_width = cap_mgaw(iommu->cap);
4232 if (dmar_domain->max_addr > (1LL << addr_width))
4234 dmar_domain->gaw = addr_width;
/*
 * Knock out extra levels of page tables if necessary.
 */
4239 while (iommu->agaw < dmar_domain->agaw) {
4240 struct dma_pte *pte;
4242 pte = dmar_domain->pgd;
4243 if (dma_pte_present(pte)) {
4244 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4245 free_pgtable_page(pte);
4247 dmar_domain->agaw--;
4253 static int intel_iommu_attach_device(struct iommu_domain *domain,
4256 struct device_domain_info *info = dev_iommu_priv_get(dev);
4259 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4260 device_is_rmrr_locked(dev)) {
4261 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4266 device_block_translation(dev);
4268 ret = prepare_domain_attach_device(domain, dev);
4272 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4275 static int intel_iommu_map(struct iommu_domain *domain,
4276 unsigned long iova, phys_addr_t hpa,
4277 size_t size, int iommu_prot, gfp_t gfp)
4279 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4283 if (iommu_prot & IOMMU_READ)
4284 prot |= DMA_PTE_READ;
4285 if (iommu_prot & IOMMU_WRITE)
4286 prot |= DMA_PTE_WRITE;
4287 if (dmar_domain->set_pte_snp)
4288 prot |= DMA_PTE_SNP;
4290 max_addr = iova + size;
4291 if (dmar_domain->max_addr < max_addr) {
4294 /* check if minimum agaw is sufficient for mapped address */
4295 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4296 if (end < max_addr) {
4297 pr_err("%s: iommu width (%d) is not "
4298 "sufficient for the mapped address (%llx)\n",
4299 __func__, dmar_domain->gaw, max_addr);
4302 dmar_domain->max_addr = max_addr;
4304 /* Round up size to next multiple of PAGE_SIZE, if it and
4305 the low bits of hpa would take us onto the next page */
4306 size = aligned_nrpages(hpa, size);
4307 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4308 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4311 static int intel_iommu_map_pages(struct iommu_domain *domain,
4312 unsigned long iova, phys_addr_t paddr,
4313 size_t pgsize, size_t pgcount,
4314 int prot, gfp_t gfp, size_t *mapped)
4316 unsigned long pgshift = __ffs(pgsize);
4317 size_t size = pgcount << pgshift;
4320 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4323 if (!IS_ALIGNED(iova | paddr, pgsize))
4326 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4333 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4334 unsigned long iova, size_t size,
4335 struct iommu_iotlb_gather *gather)
4337 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4338 unsigned long start_pfn, last_pfn;
4341 /* Cope with horrid API which requires us to unmap more than the
4342 size argument if it happens to be a large-page mapping. */
4343 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4346 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4347 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4349 start_pfn = iova >> VTD_PAGE_SHIFT;
4350 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4352 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4354 if (dmar_domain->max_addr == iova + size)
4355 dmar_domain->max_addr = iova;
/*
 * We do not use page-selective IOTLB invalidation in the flush queue,
 * so there is no need to track the page or sync the IOTLB here.
 */
4361 if (!iommu_iotlb_gather_queued(gather))
4362 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4367 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4369 size_t pgsize, size_t pgcount,
4370 struct iommu_iotlb_gather *gather)
4372 unsigned long pgshift = __ffs(pgsize);
4373 size_t size = pgcount << pgshift;
4375 return intel_iommu_unmap(domain, iova, size, gather);
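/*
 * Flush the IOTLB for the range gathered during unmap on every IOMMU
 * serving this domain, then release the freed page-table pages.
 */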
4378 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4379 struct iommu_iotlb_gather *gather)
4381 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4382 unsigned long iova_pfn = IOVA_PFN(gather->start);
4383 size_t size = gather->end - gather->start;
4384 struct iommu_domain_info *info;
4385 unsigned long start_pfn;
4386 unsigned long nrpages;
4389 nrpages = aligned_nrpages(gather->start, size);
4390 start_pfn = mm_to_dma_pfn(iova_pfn);
4392 xa_for_each(&dmar_domain->iommu_array, i, info)
4393 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4395 list_empty(&gather->freelist), 0);
4397 put_pages_list(&gather->freelist);
4400 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4403 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4404 struct dma_pte *pte;
4408 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4410 if (pte && dma_pte_present(pte))
4411 phys = dma_pte_addr(pte) +
4412 (iova & (BIT_MASK(level_to_offset_bits(level) +
4413 VTD_PAGE_SHIFT) - 1));
4418 static bool domain_support_force_snooping(struct dmar_domain *domain)
4420 struct device_domain_info *info;
4421 bool support = true;
4423 assert_spin_locked(&domain->lock);
4424 list_for_each_entry(info, &domain->devices, link) {
4425 if (!ecap_sc_support(info->iommu->ecap)) {
4434 static void domain_set_force_snooping(struct dmar_domain *domain)
4436 struct device_domain_info *info;
4438 assert_spin_locked(&domain->lock);
/*
 * The second-level page table supports per-PTE snoop control. The
 * iommu_map() interface will handle this by setting the SNP bit.
 */
4443 if (!domain->use_first_level) {
4444 domain->set_pte_snp = true;
4448 list_for_each_entry(info, &domain->devices, link)
4449 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4453 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4455 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4456 unsigned long flags;
4458 if (dmar_domain->force_snooping)
4461 spin_lock_irqsave(&dmar_domain->lock, flags);
4462 if (!domain_support_force_snooping(dmar_domain)) {
4463 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4467 domain_set_force_snooping(dmar_domain);
4468 dmar_domain->force_snooping = true;
4469 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4474 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4476 struct device_domain_info *info = dev_iommu_priv_get(dev);
4479 case IOMMU_CAP_CACHE_COHERENCY:
4481 case IOMMU_CAP_INTR_REMAP:
4482 return irq_remapping_enabled == 1;
4483 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4484 return dmar_platform_optin();
4485 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4486 return ecap_sc_support(info->iommu->ecap);
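/*
 * Per-device probe: look up the owning IOMMU, record bus/devfn/segment and
 * the ATS/PASID/PRI capabilities, and allocate a PASID table when scalable
 * mode is in use.
 */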
4492 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4494 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4495 struct device_domain_info *info;
4496 struct intel_iommu *iommu;
4500 iommu = device_to_iommu(dev, &bus, &devfn);
4501 if (!iommu || !iommu->iommu.ops)
4502 return ERR_PTR(-ENODEV);
4504 info = kzalloc(sizeof(*info), GFP_KERNEL);
4506 return ERR_PTR(-ENOMEM);
4508 if (dev_is_real_dma_subdevice(dev)) {
4509 info->bus = pdev->bus->number;
4510 info->devfn = pdev->devfn;
4511 info->segment = pci_domain_nr(pdev->bus);
4514 info->devfn = devfn;
4515 info->segment = iommu->segment;
4519 info->iommu = iommu;
4520 if (dev_is_pci(dev)) {
4521 if (ecap_dev_iotlb_support(iommu->ecap) &&
4522 pci_ats_supported(pdev) &&
4523 dmar_ats_supported(pdev, iommu)) {
4524 info->ats_supported = 1;
4525 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4527 if (sm_supported(iommu)) {
4528 if (pasid_supported(iommu)) {
4529 int features = pci_pasid_features(pdev);
4532 info->pasid_supported = features | 1;
4535 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4536 pci_pri_supported(pdev))
4537 info->pri_supported = 1;
4541 dev_iommu_priv_set(dev, info);
4543 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4544 ret = intel_pasid_alloc_table(dev);
4546 dev_err(dev, "PASID table allocation failed\n");
4547 dev_iommu_priv_set(dev, NULL);
4549 return ERR_PTR(ret);
4553 return &iommu->iommu;
4556 static void intel_iommu_release_device(struct device *dev)
4558 struct device_domain_info *info = dev_iommu_priv_get(dev);
4560 dmar_remove_one_dev_info(dev);
4561 intel_pasid_free_table(dev);
4562 dev_iommu_priv_set(dev, NULL);
4564 set_dma_ops(dev, NULL);
4567 static void intel_iommu_probe_finalize(struct device *dev)
4569 set_dma_ops(dev, NULL);
4570 iommu_setup_dma_ops(dev, 0, U64_MAX);
4573 static void intel_iommu_get_resv_regions(struct device *device,
4574 struct list_head *head)
4576 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4577 struct iommu_resv_region *reg;
4578 struct dmar_rmrr_unit *rmrr;
4579 struct device *i_dev;
4583 for_each_rmrr_units(rmrr) {
4584 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4586 struct iommu_resv_region *resv;
4587 enum iommu_resv_type type;
4590 if (i_dev != device &&
4591 !is_downstream_to_pci_bridge(device, i_dev))
4594 length = rmrr->end_address - rmrr->base_address + 1;
4596 type = device_rmrr_is_relaxable(device) ?
4597 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4599 resv = iommu_alloc_resv_region(rmrr->base_address,
4605 list_add_tail(&resv->list, head);
4610 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4611 if (dev_is_pci(device)) {
4612 struct pci_dev *pdev = to_pci_dev(device);
4614 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4615 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4616 IOMMU_RESV_DIRECT_RELAXABLE,
4619 list_add_tail(®->list, head);
4622 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4624 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4625 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4626 0, IOMMU_RESV_MSI, GFP_KERNEL);
4629 list_add_tail(®->list, head);
4632 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4634 if (dev_is_pci(dev))
4635 return pci_device_group(dev);
4636 return generic_device_group(dev);
4639 static int intel_iommu_enable_sva(struct device *dev)
4641 struct device_domain_info *info = dev_iommu_priv_get(dev);
4642 struct intel_iommu *iommu;
4645 if (!info || dmar_disabled)
4648 iommu = info->iommu;
4652 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4655 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4658 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4662 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4664 iopf_queue_remove_device(iommu->iopf_queue, dev);
4669 static int intel_iommu_disable_sva(struct device *dev)
4671 struct device_domain_info *info = dev_iommu_priv_get(dev);
4672 struct intel_iommu *iommu = info->iommu;
4675 ret = iommu_unregister_device_fault_handler(dev);
4679 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4681 iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4686 static int intel_iommu_enable_iopf(struct device *dev)
4688 struct device_domain_info *info = dev_iommu_priv_get(dev);
4690 if (info && info->pri_supported)
4697 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4700 case IOMMU_DEV_FEAT_IOPF:
4701 return intel_iommu_enable_iopf(dev);
4703 case IOMMU_DEV_FEAT_SVA:
4704 return intel_iommu_enable_sva(dev);
4712 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4715 case IOMMU_DEV_FEAT_IOPF:
4718 case IOMMU_DEV_FEAT_SVA:
4719 return intel_iommu_disable_sva(dev);
4726 static bool intel_iommu_is_attach_deferred(struct device *dev)
4728 struct device_domain_info *info = dev_iommu_priv_get(dev);
4730 return translation_pre_enabled(info->iommu) && !info->domain;
/*
 * Check that the device does not live on an external facing PCI port that is
 * marked as untrusted. Such devices should not be able to apply quirks and
 * thus not be able to bypass the IOMMU restrictions.
 */
4738 static bool risky_device(struct pci_dev *pdev)
4740 if (pdev->untrusted) {
4742 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4743 pdev->vendor, pdev->device);
4744 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4750 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4751 unsigned long iova, size_t size)
4753 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4754 unsigned long pages = aligned_nrpages(iova, size);
4755 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4756 struct iommu_domain_info *info;
4759 xa_for_each(&dmar_domain->iommu_array, i, info)
4760 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4763 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4765 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4766 struct iommu_domain *domain;
4768 /* Domain type specific cleanup: */
4769 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4771 switch (domain->type) {
4772 case IOMMU_DOMAIN_SVA:
4773 intel_svm_remove_dev_pasid(dev, pasid);
4776 /* should never reach here */
4782 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4785 const struct iommu_ops intel_iommu_ops = {
4786 .capable = intel_iommu_capable,
4787 .domain_alloc = intel_iommu_domain_alloc,
4788 .probe_device = intel_iommu_probe_device,
4789 .probe_finalize = intel_iommu_probe_finalize,
4790 .release_device = intel_iommu_release_device,
4791 .get_resv_regions = intel_iommu_get_resv_regions,
4792 .device_group = intel_iommu_device_group,
4793 .dev_enable_feat = intel_iommu_dev_enable_feat,
4794 .dev_disable_feat = intel_iommu_dev_disable_feat,
4795 .is_attach_deferred = intel_iommu_is_attach_deferred,
4796 .def_domain_type = device_def_domain_type,
4797 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4798 .pgsize_bitmap = SZ_4K,
4799 #ifdef CONFIG_INTEL_IOMMU_SVM
4800 .page_response = intel_svm_page_response,
4802 .default_domain_ops = &(const struct iommu_domain_ops) {
4803 .attach_dev = intel_iommu_attach_device,
4804 .map_pages = intel_iommu_map_pages,
4805 .unmap_pages = intel_iommu_unmap_pages,
4806 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4807 .flush_iotlb_all = intel_flush_iotlb_all,
4808 .iotlb_sync = intel_iommu_tlb_sync,
4809 .iova_to_phys = intel_iommu_iova_to_phys,
4810 .free = intel_iommu_domain_free,
4811 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4815 static void quirk_iommu_igfx(struct pci_dev *dev)
4817 if (risky_device(dev))
4820 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4824 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4827 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4828 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4830 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
        if (risky_device(dev))
                return;

        /*
         * Mobile 4 Series Chipset neglects to set RWBF capability,
         * but needs it. Same seems to hold for the desktop versions.
         */
        pci_info(dev, "Forcing write-buffer flush capability\n");
        rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
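
/*
 * Worked example (illustrative, the raw value is made up): a GGC word of
 * 0x0b10 decodes as (0x0b10 & GGC_MEMORY_SIZE_MASK) == GGC_MEMORY_SIZE_4M_VT
 * with the GGC_MEMORY_VT_ENABLED bit (0x800) set, so the Calpella quirk
 * below keeps the graphics IOMMU enabled and only forces strict IOTLB
 * flushing.
 */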

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
        unsigned short ggc;

        if (risky_device(dev))
                return;

        if (pci_read_config_word(dev, GGC, &ggc))
                return;

        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
                pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
                dmar_map_gfx = 0;
        } else if (dmar_map_gfx) {
                /* we have to ensure the gfx device is idle before we flush */
                pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
                iommu_set_dma_strict();
        }
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
        u16 ver;

        if (!IS_GFX_DEVICE(dev))
                return;

        ver = (dev->device >> 8) & 0xff;
        if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
            ver != 0x4e && ver != 0x8a && ver != 0x98 &&
            ver != 0x9a && ver != 0xa7)
                return;

        if (risky_device(dev))
                return;

        pci_info(dev, "Skip IOMMU disabling for graphics\n");
        iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
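
/*
 * Worked example (illustrative): for a graphics device ID such as 0x46a6,
 * (0x46a6 >> 8) & 0xff == 0x46, which is in the version list above, so the
 * quirk sets iommu_skip_te_disable provided the function is a GFX device
 * and is not rejected by risky_device().
 */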

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
        struct pci_dev *pdev;
        uint32_t vtisochctrl;

        /* If there's no Azalia in the system anyway, forget it. */
        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
        if (!pdev)
                return;

        if (risky_device(pdev)) {
                pci_dev_put(pdev);
                return;
        }

        pci_dev_put(pdev);

        /* System Management Registers. Might be hidden, in which case
           we can't do the sanity check. But that's OK, because the
           known-broken BIOSes _don't_ actually hide it, so far. */
        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
        if (!pdev)
                return;

        if (risky_device(pdev)) {
                pci_dev_put(pdev);
                return;
        }

        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
                pci_dev_put(pdev);
                return;
        }

        pci_dev_put(pdev);

        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
        if (vtisochctrl & 1)
                return;

        /* Drop all bits other than the number of TLB entries. */
        vtisochctrl &= 0x1c;

        /* If we have the recommended number of TLB entries (16), fine. */
        if (vtisochctrl == 0x10)
                return;

        /* Zero TLB entries? You get to ride the short bus to school. */
        if (!vtisochctrl) {
                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
                     dmi_get_system_info(DMI_BIOS_VENDOR),
                     dmi_get_system_info(DMI_BIOS_VERSION),
                     dmi_get_system_info(DMI_PRODUCT_VERSION));
                iommu_identity_mapping |= IDENTMAP_AZALIA;
                return;
        }

        pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
                vtisochctrl);
}
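
/*
 * Worked example (illustrative, the readout is made up): a vtisochctrl value
 * of 0x00000010 has bit 0 clear (Azalia DMA routed to the ISOCH DMAR unit),
 * and masking with 0x1c leaves 0x10, i.e. the recommended 16 TLB entries, so
 * the check above returns quietly without warning.
 */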

/*
 * Here we deal with a device TLB defect where a device may inadvertently
 * issue an ATS invalidation completion before posted writes that were
 * initiated with a translated address and used translations matching the
 * invalidation address range, violating the invalidation completion ordering.
 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
 * is vulnerable to this defect. In other words, any dTLB invalidation that is
 * not initiated under the control of the trusted/privileged host device
 * driver must use this quirk.
 *
 * Device TLBs are invalidated under the following six conditions:
 * 1. Device driver does DMA API unmap IOVA
 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
 * 3. PASID is torn down, after the PASID cache is flushed. e.g. process
 *    exit_mmap() due to a crash
 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where the
 *    VM has to free pages that were unmapped
 * 5. Userspace driver unmaps a DMA buffer
 * 6. Cache invalidation in vSVA usage (upcoming)
 *
 * For #1 and #2, device drivers are responsible for stopping DMA traffic
 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
 * invalidate the TLB the same way as a normal user unmap, which will use
 * this quirk. The dTLB invalidation after a PASID cache flush does not need
 * this quirk.
 *
 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
 */
void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
                               unsigned long address, unsigned long mask,
                               u32 pasid, u16 qdep)
{
        u16 sid;

        if (likely(!info->dtlb_extra_inval))
                return;

        sid = PCI_DEVID(info->bus, info->devfn);
        if (pasid == PASID_RID2PASID) {
                qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
                                   qdep, address, mask);
        } else {
                qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
                                         pasid, qdep, address, mask);
        }
}
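
/*
 * Illustrative sketch, not part of the driver: a hypothetical call site for
 * the quirk above. Real users issue the primary dev-IOTLB invalidation and
 * then call quirk_extra_dev_tlb_flush() with the same parameters; on
 * unaffected hardware (dtlb_extra_inval clear) the quirk is a no-op.
 */
#if 0
static void example_flush_dev_iotlb(struct device_domain_info *info,
                                    unsigned long addr, unsigned long mask,
                                    u16 qdep)
{
        u16 sid = PCI_DEVID(info->bus, info->devfn);

        /* Primary invalidation for requests without PASID (RID2PASID). */
        qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, qdep, addr, mask);

        /* Extra invalidation needed on parts with the ordering defect. */
        quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
}
#endif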

#define ecmd_get_status_code(res)       (((res) & 0xff) >> 1)

/*
 * Function to submit a command to the enhanced command interface. The
 * valid enhanced command descriptions are defined in Table 47 of the
 * VT-d spec. The VT-d hardware implementation may support some but not
 * all commands, which can be determined by checking the Enhanced
 * Command Capability Register.
 *
 * Return values:
 *  - 0: Command successful without any error;
 *  - Negative: software error value;
 *  - Nonzero positive: failure status code defined in Table 48.
 */
int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
{
        unsigned long flags;
        u64 res;
        int ret;

        if (!cap_ecmds(iommu->cap))
                return -ENODEV;

        raw_spin_lock_irqsave(&iommu->register_lock, flags);

        res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
        if (res & DMA_ECMD_ECRSP_IP) {
                ret = -EBUSY;
                goto err;
        }

        /*
         * Unconditionally write the operand B, because
         * - There is no side effect if an ecmd doesn't require an
         *   operand B, but we set the register to some value.
         * - It's not invoked in any critical path. The extra MMIO
         *   write doesn't bring any performance concerns.
         */
        dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
        dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));

        IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
                      !(res & DMA_ECMD_ECRSP_IP), res);

        if (res & DMA_ECMD_ECRSP_IP) {
                ret = -ETIMEDOUT;
                goto err;
        }

        ret = ecmd_get_status_code(res);
err:
        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);

        return ret;
}
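
/*
 * Illustrative sketch, not part of the driver: a hypothetical caller showing
 * how the three-way return convention documented above is meant to be
 * consumed. The command value is whatever Table 47 operation the caller
 * needs; it is not validated here.
 */
#if 0
static void example_ecmd_submit(struct intel_iommu *iommu, u8 cmd, u64 oa)
{
        int ret = ecmd_submit_sync(iommu, cmd, oa, 0);

        if (ret < 0)
                pr_err("ecmd %u not issued, software error %d\n", cmd, ret);
        else if (ret > 0)
                pr_err("ecmd %u failed, status code %d (see VT-d spec Table 48)\n",
                       cmd, ret);
        /* ret == 0: the command completed successfully. */
}
#endif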