1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-iommu.h>
21 #include <linux/intel-svm.h>
22 #include <linux/memory.h>
23 #include <linux/pci.h>
24 #include <linux/pci-ats.h>
25 #include <linux/spinlock.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/tboot.h>
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
32 #include "cap_audit.h"
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
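/*
 * A worked example of the limits above, assuming the usual 4KiB VT-d page
 * (VTD_PAGE_SHIFT == 12): for gaw = 48, __DOMAIN_MAX_PFN(48) is
 * (1ULL << 36) - 1 and DOMAIN_MAX_ADDR(48) is 0xfffffffff000, the start of
 * the last page in a 48-bit address space. On 32-bit builds DOMAIN_MAX_PFN()
 * additionally clamps the value to ULONG_MAX.
 */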
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN (1)
63 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
65 /* page table handling */
66 #define LEVEL_STRIDE (9)
67 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
69 static inline int agaw_to_level(int agaw)
74 static inline int agaw_to_width(int agaw)
76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
79 static inline int width_to_agaw(int width)
81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
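/*
 * Rough rule of thumb for the two helpers above: each AGAW step adds one
 * page-table level and LEVEL_STRIDE (9) address bits on top of a 30-bit
 * base. For example, width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2, which
 * corresponds to 4-level paging, and agaw_to_width(2) = 30 + 18 = 48 again.
 */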
84 static inline unsigned int level_to_offset_bits(int level)
86 return (level - 1) * LEVEL_STRIDE;
89 static inline int pfn_level_offset(u64 pfn, int level)
91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
94 static inline u64 level_mask(int level)
96 return -1ULL << level_to_offset_bits(level);
99 static inline u64 level_size(int level)
101 return 1ULL << level_to_offset_bits(level);
104 static inline u64 align_to_level(u64 pfn, int level)
106 return (pfn + level_size(level) - 1) & level_mask(level);
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
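/*
 * For instance, at level 2 the offset field covers one LEVEL_STRIDE, so
 * level_size(2) is 512 pages (2MiB with 4KiB pages), level_mask(2) clears
 * the low 9 pfn bits, align_to_level(pfn, 2) rounds pfn up to the next
 * 512-page boundary, and lvl_to_nr_pages(2) likewise yields 512.
 */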
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115 are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
122 return mm_to_dma_pfn(page_to_pfn(pg));
124 static inline unsigned long virt_to_dma_pfn(void *p)
126 return page_to_dma_pfn(virt_to_page(p));
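/*
 * On x86 PAGE_SHIFT and VTD_PAGE_SHIFT are both 12, so mm_to_dma_pfn() is
 * currently an identity conversion; the shift only matters if MM pages are
 * ever larger than the 4KiB VT-d page.
 */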
129 /* global iommu list, set NULL for ignored DMAR units */
130 static struct intel_iommu **g_iommus;
132 static void __init check_tylersburg_isoch(void);
133 static int rwbf_quirk;
134 static inline struct device_domain_info *
135 dmar_search_domain_by_dev_info(int segment, int bus, int devfn);
138 * set to 1 to panic the kernel if we can't successfully enable VT-d
139 * (used when the kernel is launched with TXT)
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
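/*
 * A root entry is two u64s (lo and hi), so a 4KiB root table holds
 * 4096 / 16 = 256 entries, one per PCI bus number.
 */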
148 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
156 return re->lo & VTD_PAGE_MASK;
160 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
168 return re->hi & VTD_PAGE_MASK;
171 static inline void context_clear_pasid_enable(struct context_entry *context)
173 context->lo &= ~(1ULL << 11);
176 static inline bool context_pasid_enabled(struct context_entry *context)
178 return !!(context->lo & (1ULL << 11));
181 static inline void context_set_copied(struct context_entry *context)
183 context->hi |= (1ull << 3);
186 static inline bool context_copied(struct context_entry *context)
188 return !!(context->hi & (1ULL << 3));
191 static inline bool __context_present(struct context_entry *context)
193 return (context->lo & 1);
196 bool context_present(struct context_entry *context)
198 return context_pasid_enabled(context) ?
199 __context_present(context) :
200 __context_present(context) && !context_copied(context);
203 static inline void context_set_present(struct context_entry *context)
208 static inline void context_set_fault_enable(struct context_entry *context)
210 context->lo &= (((u64)-1) << 2) | 1;
213 static inline void context_set_translation_type(struct context_entry *context,
216 context->lo &= (((u64)-1) << 4) | 3;
217 context->lo |= (value & 3) << 2;
220 static inline void context_set_address_root(struct context_entry *context,
223 context->lo &= ~VTD_PAGE_MASK;
224 context->lo |= value & VTD_PAGE_MASK;
227 static inline void context_set_address_width(struct context_entry *context,
230 context->hi |= value & 7;
233 static inline void context_set_domain_id(struct context_entry *context,
236 context->hi |= (value & ((1 << 16) - 1)) << 8;
239 static inline int context_domain_id(struct context_entry *c)
241 return (c->hi >> 8) & 0xffff;
244 static inline void context_clear_entry(struct context_entry *context)
251 * This domain is a static identity mapping domain.
252 * 1. This domain creates a static 1:1 mapping to all usable memory.
253 * 2. It maps to each iommu if successful.
254 * 3. Each iommu maps to this domain if successful.
256 static struct dmar_domain *si_domain;
257 static int hw_pass_through = 1;
259 #define for_each_domain_iommu(idx, domain) \
260 for (idx = 0; idx < g_num_of_iommus; idx++) \
261 if (domain->iommu_refcnt[idx])
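/*
 * Typical usage of the iterator above, as in domain_update_iommu_coherency()
 * below: visit only the IOMMUs that currently hold a reference on the
 * domain, e.g.
 *
 *	for_each_domain_iommu(i, domain)
 *		if (!iommu_paging_structure_coherency(g_iommus[i]))
 *			domain->iommu_coherency = false;
 */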
263 struct dmar_rmrr_unit {
264 struct list_head list; /* list of rmrr units */
265 struct acpi_dmar_header *hdr; /* ACPI header */
266 u64 base_address; /* reserved base address*/
267 u64 end_address; /* reserved end address */
268 struct dmar_dev_scope *devices; /* target devices */
269 int devices_cnt; /* target device count */
272 struct dmar_atsr_unit {
273 struct list_head list; /* list of ATSR units */
274 struct acpi_dmar_header *hdr; /* ACPI header */
275 struct dmar_dev_scope *devices; /* target devices */
276 int devices_cnt; /* target device count */
277 u8 include_all:1; /* include all ports */
280 struct dmar_satc_unit {
281 struct list_head list; /* list of SATC units */
282 struct acpi_dmar_header *hdr; /* ACPI header */
283 struct dmar_dev_scope *devices; /* target devices */
284 struct intel_iommu *iommu; /* the corresponding iommu */
285 int devices_cnt; /* target device count */
286 u8 atc_required:1; /* ATS is required */
289 static LIST_HEAD(dmar_atsr_units);
290 static LIST_HEAD(dmar_rmrr_units);
291 static LIST_HEAD(dmar_satc_units);
293 #define for_each_rmrr_units(rmrr) \
294 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
296 /* number of IOMMUs in the system, used to size g_iommus */
297 static int g_num_of_iommus;
299 static void domain_remove_dev_info(struct dmar_domain *domain);
300 static void dmar_remove_one_dev_info(struct device *dev);
301 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
303 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
304 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
306 int intel_iommu_enabled = 0;
307 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
309 static int dmar_map_gfx = 1;
310 static int intel_iommu_superpage = 1;
311 static int iommu_identity_mapping;
312 static int iommu_skip_te_disable;
314 #define IDENTMAP_GFX 2
315 #define IDENTMAP_AZALIA 4
317 int intel_iommu_gfx_mapped;
318 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
320 DEFINE_SPINLOCK(device_domain_lock);
321 static LIST_HEAD(device_domain_list);
323 const struct iommu_ops intel_iommu_ops;
325 static bool translation_pre_enabled(struct intel_iommu *iommu)
327 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
330 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
332 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
335 static void init_translation_status(struct intel_iommu *iommu)
339 gsts = readl(iommu->reg + DMAR_GSTS_REG);
340 if (gsts & DMA_GSTS_TES)
341 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
344 static int __init intel_iommu_setup(char *str)
350 if (!strncmp(str, "on", 2)) {
352 pr_info("IOMMU enabled\n");
353 } else if (!strncmp(str, "off", 3)) {
355 no_platform_optin = 1;
356 pr_info("IOMMU disabled\n");
357 } else if (!strncmp(str, "igfx_off", 8)) {
359 pr_info("Disable GFX device mapping\n");
360 } else if (!strncmp(str, "forcedac", 8)) {
361 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
362 iommu_dma_forcedac = true;
363 } else if (!strncmp(str, "strict", 6)) {
364 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
365 iommu_set_dma_strict();
366 } else if (!strncmp(str, "sp_off", 6)) {
367 pr_info("Disable supported super page\n");
368 intel_iommu_superpage = 0;
369 } else if (!strncmp(str, "sm_on", 5)) {
370 pr_info("Enable scalable mode if hardware supports\n");
372 } else if (!strncmp(str, "sm_off", 6)) {
373 pr_info("Scalable mode is disallowed\n");
375 } else if (!strncmp(str, "tboot_noforce", 13)) {
376 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
377 intel_iommu_tboot_noforce = 1;
379 pr_notice("Unknown option - '%s'\n", str);
382 str += strcspn(str, ",");
389 __setup("intel_iommu=", intel_iommu_setup);
391 void *alloc_pgtable_page(int node)
396 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
398 vaddr = page_address(page);
402 void free_pgtable_page(void *vaddr)
404 free_page((unsigned long)vaddr);
407 static inline int domain_type_is_si(struct dmar_domain *domain)
409 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
412 static inline bool domain_use_first_level(struct dmar_domain *domain)
414 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
417 static inline int domain_pfn_supported(struct dmar_domain *domain,
420 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
422 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
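/*
 * Example: a domain with agaw 2 has a 48-bit address width, so addr_width
 * here is 36 and any pfn needing more than 36 bits is rejected.
 */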
425 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
430 sagaw = cap_sagaw(iommu->cap);
431 for (agaw = width_to_agaw(max_gaw);
433 if (test_bit(agaw, &sagaw))
441 * Calculate max SAGAW for each iommu.
443 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
445 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
449 * Calculate the agaw for each iommu.
450 * "SAGAW" may be different across iommus, so use a default agaw, and
451 * fall back to a smaller supported agaw for iommus that don't support the default.
453 int iommu_calculate_agaw(struct intel_iommu *iommu)
455 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
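/*
 * For example, with DEFAULT_DOMAIN_ADDRESS_WIDTH of 57, an IOMMU whose
 * SAGAW only advertises 4-level support ends up with agaw 2 (48-bit
 * width), because __iommu_calculate_agaw() walks down from the requested
 * width until it finds a bit set in cap_sagaw().
 */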
458 /* This function only returns a single iommu in a domain */
459 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
463 /* si_domain and vm domain should not get here. */
464 if (WARN_ON(!iommu_is_dma_domain(&domain->domain)))
467 for_each_domain_iommu(iommu_id, domain)
470 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
473 return g_iommus[iommu_id];
476 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
478 return sm_supported(iommu) ?
479 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
482 static void domain_update_iommu_coherency(struct dmar_domain *domain)
484 struct dmar_drhd_unit *drhd;
485 struct intel_iommu *iommu;
489 domain->iommu_coherency = true;
491 for_each_domain_iommu(i, domain) {
493 if (!iommu_paging_structure_coherency(g_iommus[i])) {
494 domain->iommu_coherency = false;
501 /* No hardware attached; use lowest common denominator */
503 for_each_active_iommu(iommu, drhd) {
504 if (!iommu_paging_structure_coherency(iommu)) {
505 domain->iommu_coherency = false;
512 static int domain_update_iommu_superpage(struct dmar_domain *domain,
513 struct intel_iommu *skip)
515 struct dmar_drhd_unit *drhd;
516 struct intel_iommu *iommu;
519 if (!intel_iommu_superpage)
522 /* set iommu_superpage to the smallest common denominator */
524 for_each_active_iommu(iommu, drhd) {
526 if (domain && domain_use_first_level(domain)) {
527 if (!cap_fl1gp_support(iommu->cap))
530 mask &= cap_super_page_val(iommu->cap);
542 static int domain_update_device_node(struct dmar_domain *domain)
544 struct device_domain_info *info;
545 int nid = NUMA_NO_NODE;
547 assert_spin_locked(&device_domain_lock);
549 if (list_empty(&domain->devices))
552 list_for_each_entry(info, &domain->devices, link) {
557 * There could be multiple device NUMA nodes, as devices within
558 * the same domain may sit behind different IOMMUs. There is no
559 * perfect answer in such a situation, so we use a first come,
560 * first served policy.
562 nid = dev_to_node(info->dev);
563 if (nid != NUMA_NO_NODE)
570 static void domain_update_iotlb(struct dmar_domain *domain);
572 /* Return the super pagesize bitmap if supported. */
573 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
575 unsigned long bitmap = 0;
578 * 1-level super page supports page size of 2MiB, 2-level super page
579 * supports page size of both 2MiB and 1GiB.
581 if (domain->iommu_superpage == 1)
583 else if (domain->iommu_superpage == 2)
584 bitmap |= SZ_2M | SZ_1G;
589 /* Some capabilities may be different across iommus */
590 static void domain_update_iommu_cap(struct dmar_domain *domain)
592 domain_update_iommu_coherency(domain);
593 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
596 * If RHSA is missing, we should default to the device numa domain as well.
599 if (domain->nid == NUMA_NO_NODE)
600 domain->nid = domain_update_device_node(domain);
603 * First-level translation restricts the input-address to a
604 * canonical address (i.e., address bits 63:N have the same
605 * value as address bit [N-1], where N is 48-bits with 4-level
606 * paging and 57-bits with 5-level paging). Hence, skip bit [N-1].
609 if (domain_use_first_level(domain))
610 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
612 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
614 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
615 domain_update_iotlb(domain);
618 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
621 struct root_entry *root = &iommu->root_entry[bus];
622 struct context_entry *context;
626 if (sm_supported(iommu)) {
634 context = phys_to_virt(*entry & VTD_PAGE_MASK);
636 unsigned long phy_addr;
640 context = alloc_pgtable_page(iommu->node);
644 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
645 phy_addr = virt_to_phys((void *)context);
646 *entry = phy_addr | 1;
647 __iommu_flush_cache(iommu, entry, sizeof(*entry));
649 return &context[devfn];
653 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
654 * sub-hierarchy of a candidate PCI-PCI bridge
655 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
656 * @bridge: the candidate PCI-PCI bridge
658 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
661 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
663 struct pci_dev *pdev, *pbridge;
665 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
668 pdev = to_pci_dev(dev);
669 pbridge = to_pci_dev(bridge);
671 if (pbridge->subordinate &&
672 pbridge->subordinate->number <= pdev->bus->number &&
673 pbridge->subordinate->busn_res.end >= pdev->bus->number)
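/*
 * e.g. a bridge whose secondary/subordinate bus range is [0x03, 0x05]
 * claims a device on bus 0x04, but not one on bus 0x06.
 */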
679 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
681 struct dmar_drhd_unit *drhd;
685 /* We know that this device on this chipset has its own IOMMU.
686 * If we find it under a different IOMMU, then the BIOS is lying
687 * to us. Hope that the IOMMU for this device is actually
688 * disabled, and it needs no translation...
690 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
693 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
698 /* we know that this iommu should be at offset 0xa000 from vtbar */
699 drhd = dmar_find_matched_drhd_unit(pdev);
700 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
701 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
702 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
709 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
711 if (!iommu || iommu->drhd->ignored)
714 if (dev_is_pci(dev)) {
715 struct pci_dev *pdev = to_pci_dev(dev);
717 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
718 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
719 quirk_ioat_snb_local_iommu(pdev))
726 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
728 struct dmar_drhd_unit *drhd = NULL;
729 struct pci_dev *pdev = NULL;
730 struct intel_iommu *iommu;
738 if (dev_is_pci(dev)) {
739 struct pci_dev *pf_pdev;
741 pdev = pci_real_dma_dev(to_pci_dev(dev));
743 /* VFs aren't listed in scope tables; we need to look up
744 * the PF instead to find the IOMMU. */
745 pf_pdev = pci_physfn(pdev);
747 segment = pci_domain_nr(pdev->bus);
748 } else if (has_acpi_companion(dev))
749 dev = &ACPI_COMPANION(dev)->dev;
752 for_each_iommu(iommu, drhd) {
753 if (pdev && segment != drhd->segment)
756 for_each_active_dev_scope(drhd->devices,
757 drhd->devices_cnt, i, tmp) {
759 /* For a VF use its original BDF# not that of the PF
760 * which we used for the IOMMU lookup. Strictly speaking
761 * we could do this for all PCI devices; we only need to
762 * get the BDF# from the scope table for ACPI matches. */
763 if (pdev && pdev->is_virtfn)
767 *bus = drhd->devices[i].bus;
768 *devfn = drhd->devices[i].devfn;
773 if (is_downstream_to_pci_bridge(dev, tmp))
777 if (pdev && drhd->include_all) {
780 *bus = pdev->bus->number;
781 *devfn = pdev->devfn;
788 if (iommu_is_dummy(iommu, dev))
796 static void domain_flush_cache(struct dmar_domain *domain,
797 void *addr, int size)
799 if (!domain->iommu_coherency)
800 clflush_cache_range(addr, size);
803 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
805 struct context_entry *context;
809 spin_lock_irqsave(&iommu->lock, flags);
810 context = iommu_context_addr(iommu, bus, devfn, 0);
812 ret = context_present(context);
813 spin_unlock_irqrestore(&iommu->lock, flags);
817 static void free_context_table(struct intel_iommu *iommu)
821 struct context_entry *context;
823 spin_lock_irqsave(&iommu->lock, flags);
824 if (!iommu->root_entry) {
827 for (i = 0; i < ROOT_ENTRY_NR; i++) {
828 context = iommu_context_addr(iommu, i, 0, 0);
830 free_pgtable_page(context);
832 if (!sm_supported(iommu))
835 context = iommu_context_addr(iommu, i, 0x80, 0);
837 free_pgtable_page(context);
840 free_pgtable_page(iommu->root_entry);
841 iommu->root_entry = NULL;
843 spin_unlock_irqrestore(&iommu->lock, flags);
846 #ifdef CONFIG_DMAR_DEBUG
847 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
849 struct device_domain_info *info;
850 struct dma_pte *parent, *pte;
851 struct dmar_domain *domain;
854 info = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
855 if (!info || !info->domain) {
856 pr_info("device [%02x:%02x.%d] not probed\n",
857 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
861 domain = info->domain;
862 level = agaw_to_level(domain->agaw);
863 parent = domain->pgd;
865 pr_info("no page table setup\n");
870 offset = pfn_level_offset(pfn, level);
871 pte = &parent[offset];
872 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
873 pr_info("PTE not present at level %d\n", level);
877 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
882 parent = phys_to_virt(dma_pte_addr(pte));
887 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
888 unsigned long long addr, u32 pasid)
890 struct pasid_dir_entry *dir, *pde;
891 struct pasid_entry *entries, *pte;
892 struct context_entry *ctx_entry;
893 struct root_entry *rt_entry;
894 u8 devfn = source_id & 0xff;
895 u8 bus = source_id >> 8;
896 int i, dir_index, index;
898 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
900 /* root entry dump */
901 rt_entry = &iommu->root_entry[bus];
903 pr_info("root table entry is not present\n");
907 if (sm_supported(iommu))
908 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
909 rt_entry->hi, rt_entry->lo);
911 pr_info("root entry: 0x%016llx", rt_entry->lo);
913 /* context entry dump */
914 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
916 pr_info("context table entry is not present\n");
920 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
921 ctx_entry->hi, ctx_entry->lo);
923 /* legacy mode does not require PASID entries */
924 if (!sm_supported(iommu))
927 /* get the pointer to pasid directory entry */
928 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
930 pr_info("pasid directory entry is not present\n");
933 /* For request-without-pasid, get the pasid from context entry */
934 if (intel_iommu_sm && pasid == INVALID_IOASID)
935 pasid = PASID_RID2PASID;
937 dir_index = pasid >> PASID_PDE_SHIFT;
938 pde = &dir[dir_index];
939 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
941 /* get the pointer to the pasid table entry */
942 entries = get_pasid_table_from_pde(pde);
944 pr_info("pasid table entry is not present\n");
947 index = pasid & PASID_PTE_MASK;
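/*
 * Example, assuming the usual 64-entry split (PASID_PDE_SHIFT of 6):
 * pasid 0x1234 lands in directory slot 0x48 and in slot 0x34 of the
 * PASID table that directory entry points to.
 */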
948 pte = &entries[index];
949 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
950 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
953 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
957 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
958 unsigned long pfn, int *target_level)
960 struct dma_pte *parent, *pte;
961 int level = agaw_to_level(domain->agaw);
964 BUG_ON(!domain->pgd);
966 if (!domain_pfn_supported(domain, pfn))
967 /* Address beyond IOMMU's addressing capabilities. */
970 parent = domain->pgd;
975 offset = pfn_level_offset(pfn, level);
976 pte = &parent[offset];
977 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
979 if (level == *target_level)
982 if (!dma_pte_present(pte)) {
985 tmp_page = alloc_pgtable_page(domain->nid);
990 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
991 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
992 if (domain_use_first_level(domain)) {
993 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
994 if (iommu_is_dma_domain(&domain->domain))
995 pteval |= DMA_FL_PTE_ACCESS;
997 if (cmpxchg64(&pte->val, 0ULL, pteval))
998 /* Someone else set it while we were thinking; use theirs. */
999 free_pgtable_page(tmp_page);
1001 domain_flush_cache(domain, pte, sizeof(*pte));
1006 parent = phys_to_virt(dma_pte_addr(pte));
1011 *target_level = level;
1016 /* return the pte for an address at a specific level */
1017 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1019 int level, int *large_page)
1021 struct dma_pte *parent, *pte;
1022 int total = agaw_to_level(domain->agaw);
1025 parent = domain->pgd;
1026 while (level <= total) {
1027 offset = pfn_level_offset(pfn, total);
1028 pte = &parent[offset];
1032 if (!dma_pte_present(pte)) {
1033 *large_page = total;
1037 if (dma_pte_superpage(pte)) {
1038 *large_page = total;
1042 parent = phys_to_virt(dma_pte_addr(pte));
1048 /* clear last level pte; a tlb flush should follow */
1049 static void dma_pte_clear_range(struct dmar_domain *domain,
1050 unsigned long start_pfn,
1051 unsigned long last_pfn)
1053 unsigned int large_page;
1054 struct dma_pte *first_pte, *pte;
1056 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1057 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1058 BUG_ON(start_pfn > last_pfn);
1060 /* we don't need lock here; nobody else touches the iova range */
1063 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1065 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1070 start_pfn += lvl_to_nr_pages(large_page);
1072 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1074 domain_flush_cache(domain, first_pte,
1075 (void *)pte - (void *)first_pte);
1077 } while (start_pfn && start_pfn <= last_pfn);
1080 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1081 int retain_level, struct dma_pte *pte,
1082 unsigned long pfn, unsigned long start_pfn,
1083 unsigned long last_pfn)
1085 pfn = max(start_pfn, pfn);
1086 pte = &pte[pfn_level_offset(pfn, level)];
1089 unsigned long level_pfn;
1090 struct dma_pte *level_pte;
1092 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1095 level_pfn = pfn & level_mask(level);
1096 level_pte = phys_to_virt(dma_pte_addr(pte));
1099 dma_pte_free_level(domain, level - 1, retain_level,
1100 level_pte, level_pfn, start_pfn,
1105 * Free the page table if we're below the level we want to
1106 * retain and the range covers the entire table.
1108 if (level < retain_level && !(start_pfn > level_pfn ||
1109 last_pfn < level_pfn + level_size(level) - 1)) {
1111 domain_flush_cache(domain, pte, sizeof(*pte));
1112 free_pgtable_page(level_pte);
1115 pfn += level_size(level);
1116 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1120 * clear last level (leaf) ptes and free page table pages below the
1121 * level we wish to keep intact.
1123 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1124 unsigned long start_pfn,
1125 unsigned long last_pfn,
1128 dma_pte_clear_range(domain, start_pfn, last_pfn);
1130 /* We don't need lock here; nobody else touches the iova range */
1131 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1132 domain->pgd, 0, start_pfn, last_pfn);
1135 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1136 free_pgtable_page(domain->pgd);
1141 /* When a page at a given level is being unlinked from its parent, we don't
1142 need to *modify* it at all. All we need to do is make a list of all the
1143 pages which can be freed just as soon as we've flushed the IOTLB and we
1144 know the hardware page-walk will no longer touch them.
1145 The 'pte' argument is the *parent* PTE, pointing to the page that is to be freed.
1147 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1148 int level, struct dma_pte *pte,
1149 struct list_head *freelist)
1153 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1154 list_add_tail(&pg->lru, freelist);
1159 pte = page_address(pg);
1161 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1162 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1164 } while (!first_pte_in_page(pte));
1167 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1168 struct dma_pte *pte, unsigned long pfn,
1169 unsigned long start_pfn, unsigned long last_pfn,
1170 struct list_head *freelist)
1172 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1174 pfn = max(start_pfn, pfn);
1175 pte = &pte[pfn_level_offset(pfn, level)];
1178 unsigned long level_pfn = pfn & level_mask(level);
1180 if (!dma_pte_present(pte))
1183 /* If range covers entire pagetable, free it */
1184 if (start_pfn <= level_pfn &&
1185 last_pfn >= level_pfn + level_size(level) - 1) {
1186 /* These subordinate page tables are going away entirely. Don't
1187 bother to clear them; we're just going to *free* them. */
1188 if (level > 1 && !dma_pte_superpage(pte))
1189 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1195 } else if (level > 1) {
1196 /* Recurse down into a level that isn't *entirely* obsolete */
1197 dma_pte_clear_level(domain, level - 1,
1198 phys_to_virt(dma_pte_addr(pte)),
1199 level_pfn, start_pfn, last_pfn,
1203 pfn = level_pfn + level_size(level);
1204 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1207 domain_flush_cache(domain, first_pte,
1208 (void *)++last_pte - (void *)first_pte);
1211 /* We can't just free the pages because the IOMMU may still be walking
1212 the page tables, and may have cached the intermediate levels. The
1213 pages can only be freed after the IOTLB flush has been done. */
1214 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1215 unsigned long last_pfn, struct list_head *freelist)
1217 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1218 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1219 BUG_ON(start_pfn > last_pfn);
1221 /* we don't need lock here; nobody else touches the iova range */
1222 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1223 domain->pgd, 0, start_pfn, last_pfn, freelist);
1226 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1227 struct page *pgd_page = virt_to_page(domain->pgd);
1228 list_add_tail(&pgd_page->lru, freelist);
1233 /* iommu handling */
1234 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1236 struct root_entry *root;
1237 unsigned long flags;
1239 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1241 pr_err("Allocating root entry for %s failed\n",
1246 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1248 spin_lock_irqsave(&iommu->lock, flags);
1249 iommu->root_entry = root;
1250 spin_unlock_irqrestore(&iommu->lock, flags);
1255 static void iommu_set_root_entry(struct intel_iommu *iommu)
1261 addr = virt_to_phys(iommu->root_entry);
1262 if (sm_supported(iommu))
1263 addr |= DMA_RTADDR_SMT;
1265 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1266 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1268 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1270 /* Make sure hardware completes it */
1271 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1272 readl, (sts & DMA_GSTS_RTPS), sts);
1274 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1276 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1277 if (sm_supported(iommu))
1278 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1279 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1282 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1287 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1290 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1291 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1293 /* Make sure hardware completes it */
1294 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1295 readl, (!(val & DMA_GSTS_WBFS)), val);
1297 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1300 /* return value determines if we need a write buffer flush */
1301 static void __iommu_flush_context(struct intel_iommu *iommu,
1302 u16 did, u16 source_id, u8 function_mask,
1309 case DMA_CCMD_GLOBAL_INVL:
1310 val = DMA_CCMD_GLOBAL_INVL;
1312 case DMA_CCMD_DOMAIN_INVL:
1313 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1315 case DMA_CCMD_DEVICE_INVL:
1316 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1317 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1322 val |= DMA_CCMD_ICC;
1324 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1327 /* Make sure hardware completes it */
1328 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1329 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1331 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1334 /* return value determines if we need a write buffer flush */
1335 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1336 u64 addr, unsigned int size_order, u64 type)
1338 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1339 u64 val = 0, val_iva = 0;
1343 case DMA_TLB_GLOBAL_FLUSH:
1344 /* global flush doesn't need to set IVA_REG */
1345 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1347 case DMA_TLB_DSI_FLUSH:
1348 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1350 case DMA_TLB_PSI_FLUSH:
1351 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1352 /* IH bit is passed in as part of address */
1353 val_iva = size_order | addr;
1358 /* Note: set drain read/write */
1361 * This is probably just to be extra safe. Looks like we can
1362 * ignore it without any impact.
1364 if (cap_read_drain(iommu->cap))
1365 val |= DMA_TLB_READ_DRAIN;
1367 if (cap_write_drain(iommu->cap))
1368 val |= DMA_TLB_WRITE_DRAIN;
1370 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1371 /* Note: Only uses first TLB reg currently */
1373 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1374 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1376 /* Make sure hardware completes it */
1377 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1378 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1380 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1382 /* check IOTLB invalidation granularity */
1383 if (DMA_TLB_IAIG(val) == 0)
1384 pr_err("Flush IOTLB failed\n");
1385 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1386 pr_debug("TLB flush request %Lx, actual %Lx\n",
1387 (unsigned long long)DMA_TLB_IIRG(type),
1388 (unsigned long long)DMA_TLB_IAIG(val));
1391 static struct device_domain_info *
1392 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1395 struct device_domain_info *info;
1397 assert_spin_locked(&device_domain_lock);
1402 list_for_each_entry(info, &domain->devices, link)
1403 if (info->iommu == iommu && info->bus == bus &&
1404 info->devfn == devfn) {
1405 if (info->ats_supported && info->dev)
1413 static void domain_update_iotlb(struct dmar_domain *domain)
1415 struct device_domain_info *info;
1416 bool has_iotlb_device = false;
1418 assert_spin_locked(&device_domain_lock);
1420 list_for_each_entry(info, &domain->devices, link)
1421 if (info->ats_enabled) {
1422 has_iotlb_device = true;
1426 domain->has_iotlb_device = has_iotlb_device;
1429 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1431 struct pci_dev *pdev;
1433 assert_spin_locked(&device_domain_lock);
1435 if (!info || !dev_is_pci(info->dev))
1438 pdev = to_pci_dev(info->dev);
1439 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1440 * the PFSID to the invalidation descriptor of a VF so that the IOMMU HW
1441 * can gauge queue depth at the PF level. If DIT is not set, the PFSID is
1442 * treated as reserved and should be set to 0.
1444 if (!ecap_dit(info->iommu->ecap))
1447 struct pci_dev *pf_pdev;
1449 /* pdev itself is returned if the device is not a VF */
1450 pf_pdev = pci_physfn(pdev);
1451 info->pfsid = pci_dev_id(pf_pdev);
1454 #ifdef CONFIG_INTEL_IOMMU_SVM
1455 /* The PCIe spec, in its wisdom, declares that the behaviour of
1456 the device is undefined if you enable PASID support after ATS
1457 support. So always enable PASID support on devices which
1458 have it, even if we can't yet know if we're ever going to
1460 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1461 info->pasid_enabled = 1;
1463 if (info->pri_supported &&
1464 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1465 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1466 info->pri_enabled = 1;
1468 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1469 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1470 info->ats_enabled = 1;
1471 domain_update_iotlb(info->domain);
1472 info->ats_qdep = pci_ats_queue_depth(pdev);
1476 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1478 struct pci_dev *pdev;
1480 assert_spin_locked(&device_domain_lock);
1482 if (!dev_is_pci(info->dev))
1485 pdev = to_pci_dev(info->dev);
1487 if (info->ats_enabled) {
1488 pci_disable_ats(pdev);
1489 info->ats_enabled = 0;
1490 domain_update_iotlb(info->domain);
1492 #ifdef CONFIG_INTEL_IOMMU_SVM
1493 if (info->pri_enabled) {
1494 pci_disable_pri(pdev);
1495 info->pri_enabled = 0;
1497 if (info->pasid_enabled) {
1498 pci_disable_pasid(pdev);
1499 info->pasid_enabled = 0;
1504 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1505 u64 addr, unsigned int mask)
1509 if (!info || !info->ats_enabled)
1512 sid = info->bus << 8 | info->devfn;
1513 qdep = info->ats_qdep;
1514 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1518 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1519 u64 addr, unsigned mask)
1521 unsigned long flags;
1522 struct device_domain_info *info;
1524 if (!domain->has_iotlb_device)
1527 spin_lock_irqsave(&device_domain_lock, flags);
1528 list_for_each_entry(info, &domain->devices, link)
1529 __iommu_flush_dev_iotlb(info, addr, mask);
1531 spin_unlock_irqrestore(&device_domain_lock, flags);
1534 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1535 struct dmar_domain *domain,
1536 unsigned long pfn, unsigned int pages,
1539 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1540 unsigned int mask = ilog2(aligned_pages);
1541 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1542 u16 did = domain->iommu_did[iommu->seq_id];
1549 if (domain_use_first_level(domain)) {
1550 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1552 unsigned long bitmask = aligned_pages - 1;
1555 * PSI masks the low order bits of the base address. If the
1556 * address isn't aligned to the mask, then compute a mask value
1557 * needed to ensure the target range is flushed.
1559 if (unlikely(bitmask & pfn)) {
1560 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1563 * Since end_pfn <= pfn + bitmask, the only way bits
1564 * higher than bitmask can differ in pfn and end_pfn is
1565 * by carrying. This means after masking out bitmask,
1566 * high bits starting with the first set bit in
1567 * shared_bits are all equal in both pfn and end_pfn.
1569 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1570 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
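/*
 * Worked example: pfn 0x1005 with pages 8 gives bitmask 7 and
 * end_pfn 0x100c; pfn ^ end_pfn is 0x9, so shared_bits is ...fff0
 * and mask becomes 4, i.e. a 16-page flush aligned at 0x1000 that
 * covers the whole unaligned range.
 */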
1574 * Fallback to domain selective flush if no PSI support or
1575 * the size is too big.
1577 if (!cap_pgsel_inv(iommu->cap) ||
1578 mask > cap_max_amask_val(iommu->cap))
1579 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1582 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1587 * In caching mode, changing a page from non-present to present requires
1588 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1590 if (!cap_caching_mode(iommu->cap) || !map)
1591 iommu_flush_dev_iotlb(domain, addr, mask);
1594 /* Notification for newly created mappings */
1595 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1596 struct dmar_domain *domain,
1597 unsigned long pfn, unsigned int pages)
1600 * It's a non-present to present mapping. Only flush if caching mode is enabled and second-level translation is in use.
1603 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1604 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1606 iommu_flush_write_buffer(iommu);
1609 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1611 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1614 for_each_domain_iommu(idx, dmar_domain) {
1615 struct intel_iommu *iommu = g_iommus[idx];
1616 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1618 if (domain_use_first_level(dmar_domain))
1619 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1621 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1624 if (!cap_caching_mode(iommu->cap))
1625 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1629 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1632 unsigned long flags;
1634 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1637 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1639 pmen &= ~DMA_PMEN_EPM;
1640 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1642 /* wait for the protected region status bit to clear */
1643 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1644 readl, !(pmen & DMA_PMEN_PRS), pmen);
1646 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1649 static void iommu_enable_translation(struct intel_iommu *iommu)
1652 unsigned long flags;
1654 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1655 iommu->gcmd |= DMA_GCMD_TE;
1656 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1658 /* Make sure hardware completes it */
1659 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1660 readl, (sts & DMA_GSTS_TES), sts);
1662 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1665 static void iommu_disable_translation(struct intel_iommu *iommu)
1670 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1671 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1674 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1675 iommu->gcmd &= ~DMA_GCMD_TE;
1676 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1678 /* Make sure hardware completes it */
1679 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1680 readl, (!(sts & DMA_GSTS_TES)), sts);
1682 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1685 static int iommu_init_domains(struct intel_iommu *iommu)
1689 ndomains = cap_ndoms(iommu->cap);
1690 pr_debug("%s: Number of Domains supported <%d>\n",
1691 iommu->name, ndomains);
1693 spin_lock_init(&iommu->lock);
1695 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1696 if (!iommu->domain_ids)
1700 * If Caching mode is set, then invalid translations are tagged
1701 * with domain-id 0, hence we need to pre-allocate it. We also
1702 * use domain-id 0 as a marker for non-allocated domain-id, so
1703 * make sure it is not used for a real domain.
1705 set_bit(0, iommu->domain_ids);
1708 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1709 * entry for first-level or pass-through translation modes should
1710 * be programmed with a domain id different from those used for
1711 * second-level or nested translation. We reserve a domain id for this purpose.
1714 if (sm_supported(iommu))
1715 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1720 static void disable_dmar_iommu(struct intel_iommu *iommu)
1722 struct device_domain_info *info, *tmp;
1723 unsigned long flags;
1725 if (!iommu->domain_ids)
1728 spin_lock_irqsave(&device_domain_lock, flags);
1729 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1730 if (info->iommu != iommu)
1733 if (!info->dev || !info->domain)
1736 __dmar_remove_one_dev_info(info);
1738 spin_unlock_irqrestore(&device_domain_lock, flags);
1740 if (iommu->gcmd & DMA_GCMD_TE)
1741 iommu_disable_translation(iommu);
1744 static void free_dmar_iommu(struct intel_iommu *iommu)
1746 if (iommu->domain_ids) {
1747 bitmap_free(iommu->domain_ids);
1748 iommu->domain_ids = NULL;
1751 g_iommus[iommu->seq_id] = NULL;
1753 /* free context mapping */
1754 free_context_table(iommu);
1756 #ifdef CONFIG_INTEL_IOMMU_SVM
1757 if (pasid_supported(iommu)) {
1758 if (ecap_prs(iommu->ecap))
1759 intel_svm_finish_prq(iommu);
1761 if (vccap_pasid(iommu->vccap))
1762 ioasid_unregister_allocator(&iommu->pasid_allocator);
1768 * Check and return whether first level is used by default for DMA translation.
1771 static bool first_level_by_default(unsigned int type)
1773 /* Only SL is available in legacy mode */
1774 if (!scalable_mode_support())
1777 /* Only one level (either FL or SL) is available, just use it */
1778 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1779 return intel_cap_flts_sanity();
1781 /* Both levels are available, decide it based on domain type */
1782 return type != IOMMU_DOMAIN_UNMANAGED;
1785 static struct dmar_domain *alloc_domain(unsigned int type)
1787 struct dmar_domain *domain;
1789 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1793 domain->nid = NUMA_NO_NODE;
1794 if (first_level_by_default(type))
1795 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1796 domain->has_iotlb_device = false;
1797 INIT_LIST_HEAD(&domain->devices);
1802 /* Must be called with iommu->lock */
1803 static int domain_attach_iommu(struct dmar_domain *domain,
1804 struct intel_iommu *iommu)
1806 unsigned long ndomains;
1809 assert_spin_locked(&device_domain_lock);
1810 assert_spin_locked(&iommu->lock);
1812 domain->iommu_refcnt[iommu->seq_id] += 1;
1813 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1814 ndomains = cap_ndoms(iommu->cap);
1815 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1817 if (num >= ndomains) {
1818 pr_err("%s: No free domain ids\n", iommu->name);
1819 domain->iommu_refcnt[iommu->seq_id] -= 1;
1823 set_bit(num, iommu->domain_ids);
1824 domain->iommu_did[iommu->seq_id] = num;
1825 domain->nid = iommu->node;
1826 domain_update_iommu_cap(domain);
1832 static void domain_detach_iommu(struct dmar_domain *domain,
1833 struct intel_iommu *iommu)
1837 assert_spin_locked(&device_domain_lock);
1838 assert_spin_locked(&iommu->lock);
1840 domain->iommu_refcnt[iommu->seq_id] -= 1;
1841 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1842 num = domain->iommu_did[iommu->seq_id];
1843 clear_bit(num, iommu->domain_ids);
1844 domain_update_iommu_cap(domain);
1845 domain->iommu_did[iommu->seq_id] = 0;
1849 static inline int guestwidth_to_adjustwidth(int gaw)
1852 int r = (gaw - 12) % 9;
1863 static void domain_exit(struct dmar_domain *domain)
1866 /* Remove associated devices and clear attached or cached domains */
1867 domain_remove_dev_info(domain);
1870 LIST_HEAD(freelist);
1872 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1873 put_pages_list(&freelist);
1880 * Get the PASID directory size for a scalable mode context entry.
1881 * A value of X in the PDTS field of a scalable mode context entry
1882 * indicates a PASID directory with 2^(X + 7) entries.
1884 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1886 unsigned long pds, max_pde;
1888 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1889 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
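/*
 * Worked example, assuming the usual encoding where this function returns
 * max(pds - 7, 0): with a 20-bit PASID space, max_pde is 1 << 14, pds is
 * 14, and the returned value 7 encodes a directory of 2^(7 + 7) = 16384
 * entries, each covering 64 PASIDs.
 */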
1897 * Set the RID_PASID field of a scalable mode context entry. The
1898 * IOMMU hardware will use the PASID value set in this field for
1899 * DMA translations of DMA requests without PASID.
1902 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1904 context->hi |= pasid & ((1 << 20) - 1);
1908 * Set the DTE (Device-TLB Enable) field of a scalable mode context
1911 static inline void context_set_sm_dte(struct context_entry *context)
1913 context->lo |= (1 << 2);
1917 * Set the PRE (Page Request Enable) field of a scalable mode context
1920 static inline void context_set_sm_pre(struct context_entry *context)
1922 context->lo |= (1 << 4);
1925 /* Convert value to context PASID directory size field coding. */
1926 #define context_pdts(pds) (((pds) & 0x7) << 9)
1928 static int domain_context_mapping_one(struct dmar_domain *domain,
1929 struct intel_iommu *iommu,
1930 struct pasid_table *table,
1933 u16 did = domain->iommu_did[iommu->seq_id];
1934 int translation = CONTEXT_TT_MULTI_LEVEL;
1935 struct device_domain_info *info = NULL;
1936 struct context_entry *context;
1937 unsigned long flags;
1942 if (hw_pass_through && domain_type_is_si(domain))
1943 translation = CONTEXT_TT_PASS_THROUGH;
1945 pr_debug("Set context mapping for %02x:%02x.%d\n",
1946 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1948 BUG_ON(!domain->pgd);
1950 spin_lock_irqsave(&device_domain_lock, flags);
1951 spin_lock(&iommu->lock);
1954 context = iommu_context_addr(iommu, bus, devfn, 1);
1959 if (context_present(context))
1963 * For kdump cases, old valid entries may be cached due to the
1964 * in-flight DMA and copied pgtable, but there is no unmapping
1965 * behaviour for them, thus we need an explicit cache flush for
1966 * the newly-mapped device. For kdump, at this point, the device
1967 * is supposed to have finished reset at its driver probe stage, so no
1968 * in-flight DMA will exist, and we don't need to worry about it anymore.
1971 if (context_copied(context)) {
1972 u16 did_old = context_domain_id(context);
1974 if (did_old < cap_ndoms(iommu->cap)) {
1975 iommu->flush.flush_context(iommu, did_old,
1976 (((u16)bus) << 8) | devfn,
1977 DMA_CCMD_MASK_NOBIT,
1978 DMA_CCMD_DEVICE_INVL);
1979 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1984 context_clear_entry(context);
1986 if (sm_supported(iommu)) {
1991 /* Setup the PASID DIR pointer: */
1992 pds = context_get_sm_pds(table);
1993 context->lo = (u64)virt_to_phys(table->table) |
1996 /* Setup the RID_PASID field: */
1997 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2000 * Setup the Device-TLB enable bit and Page request enable bit:
2003 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2004 if (info && info->ats_supported)
2005 context_set_sm_dte(context);
2006 if (info && info->pri_supported)
2007 context_set_sm_pre(context);
2009 struct dma_pte *pgd = domain->pgd;
2012 context_set_domain_id(context, did);
2014 if (translation != CONTEXT_TT_PASS_THROUGH) {
2016 * Skip top levels of page tables for an iommu which has
2017 * a smaller agaw than the default. Unnecessary for PT mode.
2019 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2021 pgd = phys_to_virt(dma_pte_addr(pgd));
2022 if (!dma_pte_present(pgd))
2026 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2027 if (info && info->ats_supported)
2028 translation = CONTEXT_TT_DEV_IOTLB;
2030 translation = CONTEXT_TT_MULTI_LEVEL;
2032 context_set_address_root(context, virt_to_phys(pgd));
2033 context_set_address_width(context, agaw);
2036 * In pass through mode, AW must be programmed to
2037 * indicate the largest AGAW value supported by
2038 * hardware. And ASR is ignored by hardware.
2040 context_set_address_width(context, iommu->msagaw);
2043 context_set_translation_type(context, translation);
2046 context_set_fault_enable(context);
2047 context_set_present(context);
2048 if (!ecap_coherent(iommu->ecap))
2049 clflush_cache_range(context, sizeof(*context));
2052 * It's a non-present to present mapping. If hardware doesn't cache
2053 * non-present entries, we only need to flush the write-buffer. If it
2054 * _does_ cache non-present entries, then it does so in the special
2055 * domain #0, which we have to flush:
2057 if (cap_caching_mode(iommu->cap)) {
2058 iommu->flush.flush_context(iommu, 0,
2059 (((u16)bus) << 8) | devfn,
2060 DMA_CCMD_MASK_NOBIT,
2061 DMA_CCMD_DEVICE_INVL);
2062 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2064 iommu_flush_write_buffer(iommu);
2066 iommu_enable_dev_iotlb(info);
2071 spin_unlock(&iommu->lock);
2072 spin_unlock_irqrestore(&device_domain_lock, flags);
2077 struct domain_context_mapping_data {
2078 struct dmar_domain *domain;
2079 struct intel_iommu *iommu;
2080 struct pasid_table *table;
2083 static int domain_context_mapping_cb(struct pci_dev *pdev,
2084 u16 alias, void *opaque)
2086 struct domain_context_mapping_data *data = opaque;
2088 return domain_context_mapping_one(data->domain, data->iommu,
2089 data->table, PCI_BUS_NUM(alias),
2094 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2096 struct domain_context_mapping_data data;
2097 struct pasid_table *table;
2098 struct intel_iommu *iommu;
2101 iommu = device_to_iommu(dev, &bus, &devfn);
2105 table = intel_pasid_get_table(dev);
2107 if (!dev_is_pci(dev))
2108 return domain_context_mapping_one(domain, iommu, table,
2111 data.domain = domain;
2115 return pci_for_each_dma_alias(to_pci_dev(dev),
2116 &domain_context_mapping_cb, &data);
2119 static int domain_context_mapped_cb(struct pci_dev *pdev,
2120 u16 alias, void *opaque)
2122 struct intel_iommu *iommu = opaque;
2124 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2127 static int domain_context_mapped(struct device *dev)
2129 struct intel_iommu *iommu;
2132 iommu = device_to_iommu(dev, &bus, &devfn);
2136 if (!dev_is_pci(dev))
2137 return device_context_mapped(iommu, bus, devfn);
2139 return !pci_for_each_dma_alias(to_pci_dev(dev),
2140 domain_context_mapped_cb, iommu);
2143 /* Returns a number of VTD pages, but aligned to MM page size */
2144 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2147 host_addr &= ~PAGE_MASK;
2148 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
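/*
 * For example, host_addr 0x1234 with size 0x100 leaves an offset of 0x234
 * within the MM page, so PAGE_ALIGN(0x334) >> VTD_PAGE_SHIFT evaluates to
 * one 4KiB VT-d page (assuming 4KiB MM pages; larger MM pages would round
 * the result up to a multiple of PAGE_SIZE / VTD_PAGE_SIZE).
 */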
2151 /* Return largest possible superpage level for a given mapping */
2152 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2153 unsigned long iov_pfn,
2154 unsigned long phy_pfn,
2155 unsigned long pages)
2157 int support, level = 1;
2158 unsigned long pfnmerge;
2160 support = domain->iommu_superpage;
2162 /* To use a large page, the virtual *and* physical addresses
2163 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2164 of them will mean we have to use smaller pages. So just
2165 merge them and check both at once. */
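/*
 * For instance, if both iov_pfn and phy_pfn are 2MiB aligned (low 9 bits
 * clear), pages is at least 512 and the domain supports one superpage
 * level, the loop below settles on level 2 and the caller can install a
 * single 2MiB PTE instead of 512 4KiB ones.
 */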
2166 pfnmerge = iov_pfn | phy_pfn;
2168 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2169 pages >>= VTD_STRIDE_SHIFT;
2172 pfnmerge >>= VTD_STRIDE_SHIFT;
2180 * Ensure that old small page tables are removed to make room for superpage(s).
2181 * We're going to add new large pages, so make sure we don't remove their parent
2182 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2184 static void switch_to_super_page(struct dmar_domain *domain,
2185 unsigned long start_pfn,
2186 unsigned long end_pfn, int level)
2188 unsigned long lvl_pages = lvl_to_nr_pages(level);
2189 struct dma_pte *pte = NULL;
2192 while (start_pfn <= end_pfn) {
2194 pte = pfn_to_dma_pte(domain, start_pfn, &level);
2196 if (dma_pte_present(pte)) {
2197 dma_pte_free_pagetable(domain, start_pfn,
2198 start_pfn + lvl_pages - 1,
2201 for_each_domain_iommu(i, domain)
2202 iommu_flush_iotlb_psi(g_iommus[i], domain,
2203 start_pfn, lvl_pages,
2208 start_pfn += lvl_pages;
2209 if (first_pte_in_page(pte))
2215 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2216 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2218 struct dma_pte *first_pte = NULL, *pte = NULL;
2219 unsigned int largepage_lvl = 0;
2220 unsigned long lvl_pages = 0;
2224 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2226 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2229 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2230 attr |= DMA_FL_PTE_PRESENT;
2231 if (domain_use_first_level(domain)) {
2232 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2233 if (prot & DMA_PTE_WRITE)
2234 attr |= DMA_FL_PTE_DIRTY;
2237 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2239 while (nr_pages > 0) {
2243 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2244 phys_pfn, nr_pages);
2246 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2251 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2253 /* It is a large page */
2254 if (largepage_lvl > 1) {
2255 unsigned long end_pfn;
2256 unsigned long pages_to_remove;
2258 pteval |= DMA_PTE_LARGE_PAGE;
2259 pages_to_remove = min_t(unsigned long, nr_pages,
2260 nr_pte_to_next_page(pte) * lvl_pages);
2261 end_pfn = iov_pfn + pages_to_remove - 1;
2262 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2264 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2268 /* We don't need lock here, nobody else
2269 * touches the iova range
2271 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2273 static int dumps = 5;
2274 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2275 iov_pfn, tmp, (unsigned long long)pteval);
2278 debug_dma_dump_mappings(NULL);
2283 nr_pages -= lvl_pages;
2284 iov_pfn += lvl_pages;
2285 phys_pfn += lvl_pages;
2286 pteval += lvl_pages * VTD_PAGE_SIZE;
2288 /* If the next PTE would be the first in a new page, then we
2289 * need to flush the cache on the entries we've just written.
2290 * And then we'll need to recalculate 'pte', so clear it and
2291 * let it get set again in the if (!pte) block above.
2293 * If we're done (!nr_pages) we need to flush the cache too.
2295 * Also if we've been setting superpages, we may need to
2296 * recalculate 'pte' and switch back to smaller pages for the
2297 * end of the mapping, if the trailing size is not enough to
2298 * use another superpage (i.e. nr_pages < lvl_pages).
2301 if (!nr_pages || first_pte_in_page(pte) ||
2302 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2303 domain_flush_cache(domain, first_pte,
2304 (void *)pte - (void *)first_pte);
2312 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2314 struct intel_iommu *iommu = info->iommu;
2315 struct context_entry *context;
2316 unsigned long flags;
2322 spin_lock_irqsave(&iommu->lock, flags);
2323 context = iommu_context_addr(iommu, bus, devfn, 0);
2325 spin_unlock_irqrestore(&iommu->lock, flags);
2329 if (sm_supported(iommu)) {
2330 if (hw_pass_through && domain_type_is_si(info->domain))
2331 did_old = FLPT_DEFAULT_DID;
2333 did_old = info->domain->iommu_did[iommu->seq_id];
2335 did_old = context_domain_id(context);
2338 context_clear_entry(context);
2339 __iommu_flush_cache(iommu, context, sizeof(*context));
2340 spin_unlock_irqrestore(&iommu->lock, flags);
2341 iommu->flush.flush_context(iommu,
2343 (((u16)bus) << 8) | devfn,
2344 DMA_CCMD_MASK_NOBIT,
2345 DMA_CCMD_DEVICE_INVL);
2347 if (sm_supported(iommu))
2348 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2350 iommu->flush.flush_iotlb(iommu,
2356 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2359 static void domain_remove_dev_info(struct dmar_domain *domain)
2361 struct device_domain_info *info, *tmp;
2362 unsigned long flags;
2364 spin_lock_irqsave(&device_domain_lock, flags);
2365 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2366 __dmar_remove_one_dev_info(info);
2367 spin_unlock_irqrestore(&device_domain_lock, flags);
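/*
 * Look up the device_domain_info for a segment/bus/devfn triple in the
 * global device_domain_list. The caller is expected to hold
 * device_domain_lock.
 */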
2370 static inline struct device_domain_info *
2371 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2373 struct device_domain_info *info;
2375 list_for_each_entry(info, &device_domain_list, global)
2376 if (info->segment == segment && info->bus == bus &&
2377 info->devfn == devfn)
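/*
 * Set up a scalable-mode PASID table entry that uses first-level
 * translation for this device, skipping leading page-table levels when
 * the IOMMU supports a smaller address width than the domain.
 */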
2383 static int domain_setup_first_level(struct intel_iommu *iommu,
2384 struct dmar_domain *domain,
2388 struct dma_pte *pgd = domain->pgd;
2393 * Skip top levels of page tables for an IOMMU whose agaw is
2394 * smaller than the default. Unnecessary for PT mode.
2396 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2397 pgd = phys_to_virt(dma_pte_addr(pgd));
2398 if (!dma_pte_present(pgd))
2402 level = agaw_to_level(agaw);
2403 if (level != 4 && level != 5)
2406 if (pasid != PASID_RID2PASID)
2407 flags |= PASID_FLAG_SUPERVISOR_MODE;
2409 flags |= PASID_FLAG_FL5LP;
2411 if (domain->force_snooping)
2412 flags |= PASID_FLAG_PAGE_SNOOP;
2414 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2415 domain->iommu_did[iommu->seq_id],
2419 static bool dev_is_real_dma_subdevice(struct device *dev)
2421 return dev && dev_is_pci(dev) &&
2422 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
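/*
 * Install a 1:1 (identity) mapping for the given page-frame range; used
 * for the si_domain and for RMRR regions.
 */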
2425 static int iommu_domain_identity_map(struct dmar_domain *domain,
2426 unsigned long first_vpfn,
2427 unsigned long last_vpfn)
2430 * The RMRR range might overlap with the physical memory range,
2433 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2435 return __domain_mapping(domain, first_vpfn,
2436 first_vpfn, last_vpfn - first_vpfn + 1,
2437 DMA_PTE_READ|DMA_PTE_WRITE);
2440 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2442 static int __init si_domain_init(int hw)
2444 struct dmar_rmrr_unit *rmrr;
2448 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2452 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2453 domain_exit(si_domain);
2460 for_each_online_node(nid) {
2461 unsigned long start_pfn, end_pfn;
2464 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2465 ret = iommu_domain_identity_map(si_domain,
2466 mm_to_dma_pfn(start_pfn),
2467 mm_to_dma_pfn(end_pfn));
2474 * Identity-map the RMRRs so that devices with RMRRs can also use
2477 for_each_rmrr_units(rmrr) {
2478 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2480 unsigned long long start = rmrr->base_address;
2481 unsigned long long end = rmrr->end_address;
2483 if (WARN_ON(end < start ||
2484 end >> agaw_to_width(si_domain->agaw)))
2487 ret = iommu_domain_identity_map(si_domain,
2488 mm_to_dma_pfn(start >> PAGE_SHIFT),
2489 mm_to_dma_pfn(end >> PAGE_SHIFT));
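/*
 * Attach a device to a domain: link the device into the domain's device
 * list, set up the RID2PASID entry in scalable mode and program the
 * context entry.
 */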
2498 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2500 struct device_domain_info *info = dev_iommu_priv_get(dev);
2501 struct intel_iommu *iommu;
2502 unsigned long flags;
2506 iommu = device_to_iommu(dev, &bus, &devfn);
2510 spin_lock_irqsave(&device_domain_lock, flags);
2511 info->domain = domain;
2512 spin_lock(&iommu->lock);
2513 ret = domain_attach_iommu(domain, iommu);
2514 spin_unlock(&iommu->lock);
2516 spin_unlock_irqrestore(&device_domain_lock, flags);
2519 list_add(&info->link, &domain->devices);
2520 spin_unlock_irqrestore(&device_domain_lock, flags);
2522 /* PASID table is mandatory for a PCI device in scalable mode. */
2523 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2524 ret = intel_pasid_alloc_table(dev);
2526 dev_err(dev, "PASID table allocation failed\n");
2527 dmar_remove_one_dev_info(dev);
2531 /* Setup the PASID entry for requests without PASID: */
2532 spin_lock_irqsave(&iommu->lock, flags);
2533 if (hw_pass_through && domain_type_is_si(domain))
2534 ret = intel_pasid_setup_pass_through(iommu, domain,
2535 dev, PASID_RID2PASID);
2536 else if (domain_use_first_level(domain))
2537 ret = domain_setup_first_level(iommu, domain, dev,
2540 ret = intel_pasid_setup_second_level(iommu, domain,
2541 dev, PASID_RID2PASID);
2542 spin_unlock_irqrestore(&iommu->lock, flags);
2544 dev_err(dev, "Setup RID2PASID failed\n");
2545 dmar_remove_one_dev_info(dev);
2550 ret = domain_context_mapping(domain, dev);
2552 dev_err(dev, "Domain context map failed\n");
2553 dmar_remove_one_dev_info(dev);
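/*
 * Return true if any RMRR unit's device scope covers this device, either
 * directly or via an upstream PCI bridge.
 */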
2560 static bool device_has_rmrr(struct device *dev)
2562 struct dmar_rmrr_unit *rmrr;
2567 for_each_rmrr_units(rmrr) {
2569 * Return TRUE if this RMRR contains the device that
2572 for_each_active_dev_scope(rmrr->devices,
2573 rmrr->devices_cnt, i, tmp)
2575 is_downstream_to_pci_bridge(dev, tmp)) {
2585 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2586 * is relaxable (i.e., it may be left unenforced under some conditions)
2587 * @dev: device handle
2589 * We assume that PCI USB devices with RMRRs have them largely
2590 * for historical reasons and that the RMRR space is not actively used post
2591 * boot. This exclusion may change if vendors begin to abuse it.
2593 * The same exception is made for graphics devices, with the requirement that
2594 * any use of the RMRR regions will be torn down before assigning the device
2597 * Return: true if the RMRR is relaxable, false otherwise
2599 static bool device_rmrr_is_relaxable(struct device *dev)
2601 struct pci_dev *pdev;
2603 if (!dev_is_pci(dev))
2606 pdev = to_pci_dev(dev);
2607 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2614 * There are a couple of cases where we need to restrict the functionality of
2615 * devices associated with RMRRs. The first is when evaluating a device for
2616 * identity mapping because problems exist when devices are moved in and out
2617 * of domains and their respective RMRR information is lost. This means that
2618 * a device with associated RMRRs will never be in a "passthrough" domain.
2619 * The second is use of the device through the IOMMU API. This interface
2620 * expects to have full control of the IOVA space for the device. We cannot
2621 * satisfy both the requirement that RMRR access is maintained and have an
2622 * unencumbered IOVA space. We also have no ability to quiesce the device's
2623 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2624 * We therefore prevent devices associated with an RMRR from participating in
2625 * the IOMMU API, which eliminates them from device assignment.
2627 * In both cases, devices which have relaxable RMRRs are not concerned by this
2628 * restriction. See device_rmrr_is_relaxable comment.
2630 static bool device_is_rmrr_locked(struct device *dev)
2632 if (!device_has_rmrr(dev))
2635 if (device_rmrr_is_relaxable(dev))
2642 * Return the required default domain type for a specific device.
2644 * @dev: the device in query
2645 * @startup: true if this is during early boot
2648 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2649 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2650 * - 0: both identity and dynamic domains work for this device
2652 static int device_def_domain_type(struct device *dev)
2654 if (dev_is_pci(dev)) {
2655 struct pci_dev *pdev = to_pci_dev(dev);
2657 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2658 return IOMMU_DOMAIN_IDENTITY;
2660 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2661 return IOMMU_DOMAIN_IDENTITY;
2667 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2670 * Start from a sane IOMMU hardware state.
2671 * If queued invalidation was already initialized by us
2672 * (for example, while enabling interrupt remapping), then
2673 * things are already rolling from a sane state.
2677 * Clear any previous faults.
2679 dmar_fault(-1, iommu);
2681 * Disable queued invalidation if supported and already enabled
2682 * before OS handover.
2684 dmar_disable_qi(iommu);
2687 if (dmar_enable_qi(iommu)) {
2689 * Queued invalidation is not enabled; fall back to register-based invalidation
2691 iommu->flush.flush_context = __iommu_flush_context;
2692 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2693 pr_info("%s: Using Register based invalidation\n",
2696 iommu->flush.flush_context = qi_flush_context;
2697 iommu->flush.flush_iotlb = qi_flush_iotlb;
2698 pr_info("%s: Using Queued invalidation\n", iommu->name);
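/*
 * Copy the context entries for one bus from the previous kernel's tables
 * into newly allocated pages, preserving the domain IDs that are already
 * in use and marking each entry as copied.
 */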
2702 static int copy_context_table(struct intel_iommu *iommu,
2703 struct root_entry *old_re,
2704 struct context_entry **tbl,
2707 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2708 struct context_entry *new_ce = NULL, ce;
2709 struct context_entry *old_ce = NULL;
2710 struct root_entry re;
2711 phys_addr_t old_ce_phys;
2713 tbl_idx = ext ? bus * 2 : bus;
2714 memcpy(&re, old_re, sizeof(re));
2716 for (devfn = 0; devfn < 256; devfn++) {
2717 /* First calculate the correct index */
2718 idx = (ext ? devfn * 2 : devfn) % 256;
2721 /* First save what we may have and clean up */
2723 tbl[tbl_idx] = new_ce;
2724 __iommu_flush_cache(iommu, new_ce,
2734 old_ce_phys = root_entry_lctp(&re);
2736 old_ce_phys = root_entry_uctp(&re);
2739 if (ext && devfn == 0) {
2740 /* No LCTP, try UCTP */
2749 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2754 new_ce = alloc_pgtable_page(iommu->node);
2761 /* Now copy the context entry */
2762 memcpy(&ce, old_ce + idx, sizeof(ce));
2764 if (!__context_present(&ce))
2767 did = context_domain_id(&ce);
2768 if (did >= 0 && did < cap_ndoms(iommu->cap))
2769 set_bit(did, iommu->domain_ids);
2772 * We need a marker for copied context entries. This
2773 * marker needs to work for the old format as well as
2774 * for extended context entries.
2776 * Bit 67 of the context entry is used. In the old
2777 * format this bit is available to software, in the
2778 * extended format it is the PGE bit, but PGE is ignored
2779 * by HW if PASIDs are disabled (and thus still
2782 * So disable PASIDs first and then mark the entry
2783 * copied. This means that we don't copy PASID
2784 * translations from the old kernel, but this is fine as
2785 * faults there are not fatal.
2787 context_clear_pasid_enable(&ce);
2788 context_set_copied(&ce);
2793 tbl[tbl_idx + pos] = new_ce;
2795 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
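/*
 * In a kdump kernel, copy the old root and context tables so that DMA set
 * up by the crashed kernel keeps working until the devices are
 * re-initialized.
 */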
2804 static int copy_translation_tables(struct intel_iommu *iommu)
2806 struct context_entry **ctxt_tbls;
2807 struct root_entry *old_rt;
2808 phys_addr_t old_rt_phys;
2809 int ctxt_table_entries;
2810 unsigned long flags;
2815 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2816 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
2817 new_ext = !!ecap_ecs(iommu->ecap);
2820 * The RTT bit can only be changed when translation is disabled,
2821 * but disabling translation means opening a window for data
2822 * corruption. So bail out and don't copy anything if we would
2823 * have to change the bit.
2828 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2832 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2836 /* This is too big for the stack - allocate it from slab */
2837 ctxt_table_entries = ext ? 512 : 256;
2839 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2843 for (bus = 0; bus < 256; bus++) {
2844 ret = copy_context_table(iommu, &old_rt[bus],
2845 ctxt_tbls, bus, ext);
2847 pr_err("%s: Failed to copy context table for bus %d\n",
2853 spin_lock_irqsave(&iommu->lock, flags);
2855 /* Context tables are copied, now write them to the root_entry table */
2856 for (bus = 0; bus < 256; bus++) {
2857 int idx = ext ? bus * 2 : bus;
2860 if (ctxt_tbls[idx]) {
2861 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2862 iommu->root_entry[bus].lo = val;
2865 if (!ext || !ctxt_tbls[idx + 1])
2868 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2869 iommu->root_entry[bus].hi = val;
2872 spin_unlock_irqrestore(&iommu->lock, flags);
2876 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2886 #ifdef CONFIG_INTEL_IOMMU_SVM
2887 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2889 struct intel_iommu *iommu = data;
2893 return INVALID_IOASID;
2895 * The VT-d virtual command interface always uses the full 20-bit
2896 * PASID range. The host can partition the guest PASID range based on
2897 * policies, but this is out of the guest's control.
2899 if (min < PASID_MIN || max > intel_pasid_max_id)
2900 return INVALID_IOASID;
2902 if (vcmd_alloc_pasid(iommu, &ioasid))
2903 return INVALID_IOASID;
2908 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2910 struct intel_iommu *iommu = data;
2915 * The sanity check of the ioasid owner is done at the upper layer, e.g. VFIO.
2916 * We can only free the PASID when all the devices are unbound.
2918 if (ioasid_find(NULL, ioasid, NULL)) {
2919 pr_alert("Cannot free active IOASID %d\n", ioasid);
2922 vcmd_free_pasid(iommu, ioasid);
2925 static void register_pasid_allocator(struct intel_iommu *iommu)
2928 * If we are running in the host, there is no need for a custom allocator,
2929 * since PASIDs are allocated from the host system-wide.
2931 if (!cap_caching_mode(iommu->cap))
2934 if (!sm_supported(iommu)) {
2935 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2940 * Register a custom PASID allocator if we are running in a guest;
2941 * guest PASIDs must be obtained via the virtual command interface.
2942 * There can be multiple vIOMMUs in each guest, but only one allocator
2943 * is active. All vIOMMU allocators will eventually call the same
2946 if (!vccap_pasid(iommu->vccap))
2949 pr_info("Register custom PASID allocator\n");
2950 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2951 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2952 iommu->pasid_allocator.pdata = (void *)iommu;
2953 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2954 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2956 * Disable scalable mode on this IOMMU if there
2957 * is no custom allocator. Mixing SM-capable vIOMMUs
2958 * and non-SM vIOMMUs is not supported.
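/*
 * Bring up all DMAR units at boot: allocate the global IOMMU array,
 * per-IOMMU domain IDs and root entries, copy translation tables in the
 * kdump case, create the identity (si) domain and enable queued
 * invalidation, fault reporting and translation.
 */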
2965 static int __init init_dmars(void)
2967 struct dmar_drhd_unit *drhd;
2968 struct intel_iommu *iommu;
2974 * initialize and program root entry to not present
2977 for_each_drhd_unit(drhd) {
2979 * No lock is needed, as this is only incremented in the single-
2980 * threaded kernel __init code path; all other accesses are read
2983 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2987 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
2990 /* Preallocate enough resources for IOMMU hot-addition */
2991 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2992 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2994 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3001 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3005 for_each_iommu(iommu, drhd) {
3006 if (drhd->ignored) {
3007 iommu_disable_translation(iommu);
3012 * Find the max pasid size of all IOMMUs in the system.
3013 * We need to ensure the system pasid table is no bigger
3014 * than the smallest supported.
3016 if (pasid_supported(iommu)) {
3017 u32 temp = 2 << ecap_pss(iommu->ecap);
3019 intel_pasid_max_id = min_t(u32, temp,
3020 intel_pasid_max_id);
3023 g_iommus[iommu->seq_id] = iommu;
3025 intel_iommu_init_qi(iommu);
3027 ret = iommu_init_domains(iommu);
3031 init_translation_status(iommu);
3033 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3034 iommu_disable_translation(iommu);
3035 clear_translation_pre_enabled(iommu);
3036 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3042 * we could share the same root & context tables
3043 * among all IOMMUs. This needs to be split later.
3045 ret = iommu_alloc_root_entry(iommu);
3049 if (translation_pre_enabled(iommu)) {
3050 pr_info("Translation already enabled - trying to copy translation structures\n");
3052 ret = copy_translation_tables(iommu);
3055 * We found the IOMMU with translation
3056 * enabled - but failed to copy over the
3057 * old root-entry table. Try to proceed
3058 * by disabling translation now and
3059 * allocating a clean root-entry table.
3060 * This might cause DMAR faults, but
3061 * probably the dump will still succeed.
3063 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3065 iommu_disable_translation(iommu);
3066 clear_translation_pre_enabled(iommu);
3068 pr_info("Copied translation tables from previous kernel for %s\n",
3073 if (!ecap_pass_through(iommu->ecap))
3074 hw_pass_through = 0;
3075 intel_svm_check(iommu);
3079 * Now that qi is enabled on all iommus, set the root entry and flush
3080 * caches. This is required on some Intel X58 chipsets, otherwise the
3081 * flush_context function will loop forever and the boot hangs.
3083 for_each_active_iommu(iommu, drhd) {
3084 iommu_flush_write_buffer(iommu);
3085 #ifdef CONFIG_INTEL_IOMMU_SVM
3086 register_pasid_allocator(iommu);
3088 iommu_set_root_entry(iommu);
3091 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3096 iommu_identity_mapping |= IDENTMAP_GFX;
3098 check_tylersburg_isoch();
3100 ret = si_domain_init(hw_pass_through);
3107 * global invalidate context cache
3108 * global invalidate iotlb
3109 * enable translation
3111 for_each_iommu(iommu, drhd) {
3112 if (drhd->ignored) {
3114 * we always have to disable PMRs or DMA may fail on
3118 iommu_disable_protect_mem_regions(iommu);
3122 iommu_flush_write_buffer(iommu);
3124 #ifdef CONFIG_INTEL_IOMMU_SVM
3125 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3127 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3128 * could cause a lock race condition.
3130 up_write(&dmar_global_lock);
3131 ret = intel_svm_enable_prq(iommu);
3132 down_write(&dmar_global_lock);
3137 ret = dmar_set_interrupt(iommu);
3145 for_each_active_iommu(iommu, drhd) {
3146 disable_dmar_iommu(iommu);
3147 free_dmar_iommu(iommu);
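/*
 * Mark DRHD units that cover no devices at all, or only graphics
 * devices, so they can be ignored or flagged as graphics-dedicated.
 */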
3156 static void __init init_no_remapping_devices(void)
3158 struct dmar_drhd_unit *drhd;
3162 for_each_drhd_unit(drhd) {
3163 if (!drhd->include_all) {
3164 for_each_active_dev_scope(drhd->devices,
3165 drhd->devices_cnt, i, dev)
3167 /* ignore DMAR unit if no devices exist */
3168 if (i == drhd->devices_cnt)
3173 for_each_active_drhd_unit(drhd) {
3174 if (drhd->include_all)
3177 for_each_active_dev_scope(drhd->devices,
3178 drhd->devices_cnt, i, dev)
3179 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3181 if (i < drhd->devices_cnt)
3184 /* This IOMMU has *only* gfx devices. Either bypass it or
3185 set the gfx_mapped flag, as appropriate */
3186 drhd->gfx_dedicated = 1;
3192 #ifdef CONFIG_SUSPEND
3193 static int init_iommu_hw(void)
3195 struct dmar_drhd_unit *drhd;
3196 struct intel_iommu *iommu = NULL;
3198 for_each_active_iommu(iommu, drhd)
3200 dmar_reenable_qi(iommu);
3202 for_each_iommu(iommu, drhd) {
3203 if (drhd->ignored) {
3205 * we always have to disable PMRs or DMA may fail on
3209 iommu_disable_protect_mem_regions(iommu);
3213 iommu_flush_write_buffer(iommu);
3214 iommu_set_root_entry(iommu);
3215 iommu_enable_translation(iommu);
3216 iommu_disable_protect_mem_regions(iommu);
3222 static void iommu_flush_all(void)
3224 struct dmar_drhd_unit *drhd;
3225 struct intel_iommu *iommu;
3227 for_each_active_iommu(iommu, drhd) {
3228 iommu->flush.flush_context(iommu, 0, 0, 0,
3229 DMA_CCMD_GLOBAL_INVL);
3230 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3231 DMA_TLB_GLOBAL_FLUSH);
3235 static int iommu_suspend(void)
3237 struct dmar_drhd_unit *drhd;
3238 struct intel_iommu *iommu = NULL;
3241 for_each_active_iommu(iommu, drhd) {
3242 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3244 if (!iommu->iommu_state)
3250 for_each_active_iommu(iommu, drhd) {
3251 iommu_disable_translation(iommu);
3253 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3255 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3256 readl(iommu->reg + DMAR_FECTL_REG);
3257 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3258 readl(iommu->reg + DMAR_FEDATA_REG);
3259 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3260 readl(iommu->reg + DMAR_FEADDR_REG);
3261 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3262 readl(iommu->reg + DMAR_FEUADDR_REG);
3264 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3269 for_each_active_iommu(iommu, drhd)
3270 kfree(iommu->iommu_state);
3275 static void iommu_resume(void)
3277 struct dmar_drhd_unit *drhd;
3278 struct intel_iommu *iommu = NULL;
3281 if (init_iommu_hw()) {
3283 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3285 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3289 for_each_active_iommu(iommu, drhd) {
3291 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3293 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3294 iommu->reg + DMAR_FECTL_REG);
3295 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3296 iommu->reg + DMAR_FEDATA_REG);
3297 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3298 iommu->reg + DMAR_FEADDR_REG);
3299 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3300 iommu->reg + DMAR_FEUADDR_REG);
3302 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3305 for_each_active_iommu(iommu, drhd)
3306 kfree(iommu->iommu_state);
3309 static struct syscore_ops iommu_syscore_ops = {
3310 .resume = iommu_resume,
3311 .suspend = iommu_suspend,
3314 static void __init init_iommu_pm_ops(void)
3316 register_syscore_ops(&iommu_syscore_ops);
3320 static inline void init_iommu_pm_ops(void) {}
3321 #endif /* CONFIG_PM */
3323 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3325 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3326 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3327 rmrr->end_address <= rmrr->base_address ||
3328 arch_rmrr_sanity_check(rmrr))
3334 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3336 struct acpi_dmar_reserved_memory *rmrr;
3337 struct dmar_rmrr_unit *rmrru;
3339 rmrr = (struct acpi_dmar_reserved_memory *)header;
3340 if (rmrr_sanity_check(rmrr)) {
3342 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3343 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3344 rmrr->base_address, rmrr->end_address,
3345 dmi_get_system_info(DMI_BIOS_VENDOR),
3346 dmi_get_system_info(DMI_BIOS_VERSION),
3347 dmi_get_system_info(DMI_PRODUCT_VERSION));
3348 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3351 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3355 rmrru->hdr = header;
3357 rmrru->base_address = rmrr->base_address;
3358 rmrru->end_address = rmrr->end_address;
3360 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3361 ((void *)rmrr) + rmrr->header.length,
3362 &rmrru->devices_cnt);
3363 if (rmrru->devices_cnt && rmrru->devices == NULL)
3366 list_add(&rmrru->list, &dmar_rmrr_units);
3375 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3377 struct dmar_atsr_unit *atsru;
3378 struct acpi_dmar_atsr *tmp;
3380 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3382 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3383 if (atsr->segment != tmp->segment)
3385 if (atsr->header.length != tmp->header.length)
3387 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3394 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3396 struct acpi_dmar_atsr *atsr;
3397 struct dmar_atsr_unit *atsru;
3399 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3402 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3403 atsru = dmar_find_atsr(atsr);
3407 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3412 * If memory is allocated from slab by the ACPI _DSM method, we need to
3413 * copy the memory content because the memory buffer will be freed
3416 atsru->hdr = (void *)(atsru + 1);
3417 memcpy(atsru->hdr, hdr, hdr->length);
3418 atsru->include_all = atsr->flags & 0x1;
3419 if (!atsru->include_all) {
3420 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3421 (void *)atsr + atsr->header.length,
3422 &atsru->devices_cnt);
3423 if (atsru->devices_cnt && atsru->devices == NULL) {
3429 list_add_rcu(&atsru->list, &dmar_atsr_units);
3434 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3436 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3440 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3442 struct acpi_dmar_atsr *atsr;
3443 struct dmar_atsr_unit *atsru;
3445 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3446 atsru = dmar_find_atsr(atsr);
3448 list_del_rcu(&atsru->list);
3450 intel_iommu_free_atsr(atsru);
3456 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3460 struct acpi_dmar_atsr *atsr;
3461 struct dmar_atsr_unit *atsru;
3463 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3464 atsru = dmar_find_atsr(atsr);
3468 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3469 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3477 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3479 struct dmar_satc_unit *satcu;
3480 struct acpi_dmar_satc *tmp;
3482 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3484 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3485 if (satc->segment != tmp->segment)
3487 if (satc->header.length != tmp->header.length)
3489 if (memcmp(satc, tmp, satc->header.length) == 0)
3496 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3498 struct acpi_dmar_satc *satc;
3499 struct dmar_satc_unit *satcu;
3501 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3504 satc = container_of(hdr, struct acpi_dmar_satc, header);
3505 satcu = dmar_find_satc(satc);
3509 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3513 satcu->hdr = (void *)(satcu + 1);
3514 memcpy(satcu->hdr, hdr, hdr->length);
3515 satcu->atc_required = satc->flags & 0x1;
3516 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3517 (void *)satc + satc->header.length,
3518 &satcu->devices_cnt);
3519 if (satcu->devices_cnt && !satcu->devices) {
3523 list_add_rcu(&satcu->list, &dmar_satc_units);
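/*
 * Initialize and enable a hot-added DMAR unit, mirroring the per-IOMMU
 * steps performed by init_dmars() at boot.
 */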
3528 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3531 struct intel_iommu *iommu = dmaru->iommu;
3533 if (g_iommus[iommu->seq_id])
3536 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3540 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3541 pr_warn("%s: Doesn't support hardware pass through.\n",
3546 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3547 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3548 pr_warn("%s: Doesn't support large page.\n",
3554 * Disable translation if already enabled prior to OS handover.
3556 if (iommu->gcmd & DMA_GCMD_TE)
3557 iommu_disable_translation(iommu);
3559 g_iommus[iommu->seq_id] = iommu;
3560 ret = iommu_init_domains(iommu);
3562 ret = iommu_alloc_root_entry(iommu);
3566 intel_svm_check(iommu);
3568 if (dmaru->ignored) {
3570 * we always have to disable PMRs or DMA may fail on this device
3573 iommu_disable_protect_mem_regions(iommu);
3577 intel_iommu_init_qi(iommu);
3578 iommu_flush_write_buffer(iommu);
3580 #ifdef CONFIG_INTEL_IOMMU_SVM
3581 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3582 ret = intel_svm_enable_prq(iommu);
3587 ret = dmar_set_interrupt(iommu);
3591 iommu_set_root_entry(iommu);
3592 iommu_enable_translation(iommu);
3594 iommu_disable_protect_mem_regions(iommu);
3598 disable_dmar_iommu(iommu);
3600 free_dmar_iommu(iommu);
3604 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3607 struct intel_iommu *iommu = dmaru->iommu;
3609 if (!intel_iommu_enabled)
3615 ret = intel_iommu_add(dmaru);
3617 disable_dmar_iommu(iommu);
3618 free_dmar_iommu(iommu);
3624 static void intel_iommu_free_dmars(void)
3626 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3627 struct dmar_atsr_unit *atsru, *atsr_n;
3628 struct dmar_satc_unit *satcu, *satc_n;
3630 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3631 list_del(&rmrru->list);
3632 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3636 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3637 list_del(&atsru->list);
3638 intel_iommu_free_atsr(atsru);
3640 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3641 list_del(&satcu->list);
3642 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3647 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3649 struct dmar_satc_unit *satcu;
3650 struct acpi_dmar_satc *satc;
3654 dev = pci_physfn(dev);
3657 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3658 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3659 if (satc->segment != pci_domain_nr(dev->bus))
3661 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3662 if (to_pci_dev(tmp) == dev)
3671 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3674 struct pci_bus *bus;
3675 struct pci_dev *bridge = NULL;
3677 struct acpi_dmar_atsr *atsr;
3678 struct dmar_atsr_unit *atsru;
3679 struct dmar_satc_unit *satcu;
3681 dev = pci_physfn(dev);
3682 satcu = dmar_find_matched_satc_unit(dev);
3685 * This device supports ATS, as it is in the SATC table.
3686 * When the IOMMU is in legacy mode, enabling ATS is done
3687 * automatically by HW for the device that requires
3688 * ATS, hence the OS should not enable ATS on this device,
3689 * to avoid duplicated TLB invalidations.
3691 return !(satcu->atc_required && !sm_supported(iommu));
3693 for (bus = dev->bus; bus; bus = bus->parent) {
3695 /* If it's an integrated device, allow ATS */
3698 /* Connected via non-PCIe: no ATS */
3699 if (!pci_is_pcie(bridge) ||
3700 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3702 /* If we found the root port, look it up in the ATSR */
3703 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3708 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3709 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3710 if (atsr->segment != pci_domain_nr(dev->bus))
3713 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3714 if (tmp == &bridge->dev)
3717 if (atsru->include_all)
3727 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3730 struct dmar_rmrr_unit *rmrru;
3731 struct dmar_atsr_unit *atsru;
3732 struct dmar_satc_unit *satcu;
3733 struct acpi_dmar_atsr *atsr;
3734 struct acpi_dmar_reserved_memory *rmrr;
3735 struct acpi_dmar_satc *satc;
3737 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3740 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3741 rmrr = container_of(rmrru->hdr,
3742 struct acpi_dmar_reserved_memory, header);
3743 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3744 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3745 ((void *)rmrr) + rmrr->header.length,
3746 rmrr->segment, rmrru->devices,
3747 rmrru->devices_cnt);
3750 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3751 dmar_remove_dev_scope(info, rmrr->segment,
3752 rmrru->devices, rmrru->devices_cnt);
3756 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3757 if (atsru->include_all)
3760 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3761 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3762 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3763 (void *)atsr + atsr->header.length,
3764 atsr->segment, atsru->devices,
3765 atsru->devices_cnt);
3770 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3771 if (dmar_remove_dev_scope(info, atsr->segment,
3772 atsru->devices, atsru->devices_cnt))
3776 list_for_each_entry(satcu, &dmar_satc_units, list) {
3777 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3778 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3779 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3780 (void *)satc + satc->header.length,
3781 satc->segment, satcu->devices,
3782 satcu->devices_cnt);
3787 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3788 if (dmar_remove_dev_scope(info, satc->segment,
3789 satcu->devices, satcu->devices_cnt))
3797 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3798 unsigned long val, void *v)
3800 struct memory_notify *mhp = v;
3801 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3802 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3806 case MEM_GOING_ONLINE:
3807 if (iommu_domain_identity_map(si_domain,
3808 start_vpfn, last_vpfn)) {
3809 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3810 start_vpfn, last_vpfn);
3816 case MEM_CANCEL_ONLINE:
3818 struct dmar_drhd_unit *drhd;
3819 struct intel_iommu *iommu;
3820 LIST_HEAD(freelist);
3822 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3825 for_each_active_iommu(iommu, drhd)
3826 iommu_flush_iotlb_psi(iommu, si_domain,
3827 start_vpfn, mhp->nr_pages,
3828 list_empty(&freelist), 0);
3830 put_pages_list(&freelist);
3838 static struct notifier_block intel_iommu_memory_nb = {
3839 .notifier_call = intel_iommu_memory_notifier,
3843 static void intel_disable_iommus(void)
3845 struct intel_iommu *iommu = NULL;
3846 struct dmar_drhd_unit *drhd;
3848 for_each_iommu(iommu, drhd)
3849 iommu_disable_translation(iommu);
3852 void intel_iommu_shutdown(void)
3854 struct dmar_drhd_unit *drhd;
3855 struct intel_iommu *iommu = NULL;
3857 if (no_iommu || dmar_disabled)
3860 down_write(&dmar_global_lock);
3862 /* Disable PMRs explicitly here. */
3863 for_each_iommu(iommu, drhd)
3864 iommu_disable_protect_mem_regions(iommu);
3866 /* Make sure the IOMMUs are switched off */
3867 intel_disable_iommus();
3869 up_write(&dmar_global_lock);
3872 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3874 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3876 return container_of(iommu_dev, struct intel_iommu, iommu);
3879 static ssize_t version_show(struct device *dev,
3880 struct device_attribute *attr, char *buf)
3882 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3883 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3884 return sprintf(buf, "%d:%d\n",
3885 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3887 static DEVICE_ATTR_RO(version);
3889 static ssize_t address_show(struct device *dev,
3890 struct device_attribute *attr, char *buf)
3892 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3893 return sprintf(buf, "%llx\n", iommu->reg_phys);
3895 static DEVICE_ATTR_RO(address);
3897 static ssize_t cap_show(struct device *dev,
3898 struct device_attribute *attr, char *buf)
3900 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3901 return sprintf(buf, "%llx\n", iommu->cap);
3903 static DEVICE_ATTR_RO(cap);
3905 static ssize_t ecap_show(struct device *dev,
3906 struct device_attribute *attr, char *buf)
3908 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3909 return sprintf(buf, "%llx\n", iommu->ecap);
3911 static DEVICE_ATTR_RO(ecap);
3913 static ssize_t domains_supported_show(struct device *dev,
3914 struct device_attribute *attr, char *buf)
3916 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3917 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3919 static DEVICE_ATTR_RO(domains_supported);
3921 static ssize_t domains_used_show(struct device *dev,
3922 struct device_attribute *attr, char *buf)
3924 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3925 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3926 cap_ndoms(iommu->cap)));
3928 static DEVICE_ATTR_RO(domains_used);
3930 static struct attribute *intel_iommu_attrs[] = {
3931 &dev_attr_version.attr,
3932 &dev_attr_address.attr,
3934 &dev_attr_ecap.attr,
3935 &dev_attr_domains_supported.attr,
3936 &dev_attr_domains_used.attr,
3940 static struct attribute_group intel_iommu_group = {
3941 .name = "intel-iommu",
3942 .attrs = intel_iommu_attrs,
3945 const struct attribute_group *intel_iommu_groups[] = {
3950 static inline bool has_external_pci(void)
3952 struct pci_dev *pdev = NULL;
3954 for_each_pci_dev(pdev)
3955 if (pdev->external_facing)
3961 static int __init platform_optin_force_iommu(void)
3963 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3966 if (no_iommu || dmar_disabled)
3967 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3970 * If Intel-IOMMU is disabled by default, we will apply identity
3971 * map for all devices except those marked as being untrusted.
3974 iommu_set_default_passthrough(false);
3982 static int __init probe_acpi_namespace_devices(void)
3984 struct dmar_drhd_unit *drhd;
3985 /* To avoid a -Wunused-but-set-variable warning. */
3986 struct intel_iommu *iommu __maybe_unused;
3990 for_each_active_iommu(iommu, drhd) {
3991 for_each_active_dev_scope(drhd->devices,
3992 drhd->devices_cnt, i, dev) {
3993 struct acpi_device_physical_node *pn;
3994 struct iommu_group *group;
3995 struct acpi_device *adev;
3997 if (dev->bus != &acpi_bus_type)
4000 adev = to_acpi_device(dev);
4001 mutex_lock(&adev->physical_node_lock);
4002 list_for_each_entry(pn,
4003 &adev->physical_node_list, node) {
4004 group = iommu_group_get(pn->dev);
4006 iommu_group_put(group);
4010 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4011 ret = iommu_probe_device(pn->dev);
4015 mutex_unlock(&adev->physical_node_lock);
4025 int __init intel_iommu_init(void)
4028 struct dmar_drhd_unit *drhd;
4029 struct intel_iommu *iommu;
4032 * Intel IOMMU is required for a TXT/tboot launch or platform
4033 * opt in, so enforce that.
4035 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4036 platform_optin_force_iommu();
4038 down_write(&dmar_global_lock);
4039 if (dmar_table_init()) {
4041 panic("tboot: Failed to initialize DMAR table\n");
4045 if (dmar_dev_scope_init() < 0) {
4047 panic("tboot: Failed to initialize DMAR device scope\n");
4051 up_write(&dmar_global_lock);
4054 * The bus notifier takes the dmar_global_lock, so lockdep will
4055 * complain later when we register it under the lock.
4057 dmar_register_bus_notifier();
4059 down_write(&dmar_global_lock);
4062 intel_iommu_debugfs_init();
4064 if (no_iommu || dmar_disabled) {
4066 * We exit the function here to ensure the IOMMU's remapping and
4067 * mempool aren't set up, which means that the IOMMU's PMRs
4068 * won't be disabled via the call to init_dmars(). So disable
4069 * them explicitly here. The PMRs were set up by tboot prior to
4070 * calling SENTER, but the kernel is expected to reset/tear
4073 if (intel_iommu_tboot_noforce) {
4074 for_each_iommu(iommu, drhd)
4075 iommu_disable_protect_mem_regions(iommu);
4079 * Make sure the IOMMUs are switched off, even when we
4080 * boot into a kexec kernel and the previous kernel left
4083 intel_disable_iommus();
4087 if (list_empty(&dmar_rmrr_units))
4088 pr_info("No RMRR found\n");
4090 if (list_empty(&dmar_atsr_units))
4091 pr_info("No ATSR found\n");
4093 if (list_empty(&dmar_satc_units))
4094 pr_info("No SATC found\n");
4097 intel_iommu_gfx_mapped = 1;
4099 init_no_remapping_devices();
4104 panic("tboot: Failed to initialize DMARs\n");
4105 pr_err("Initialization failed\n");
4108 up_write(&dmar_global_lock);
4110 init_iommu_pm_ops();
4112 down_read(&dmar_global_lock);
4113 for_each_active_iommu(iommu, drhd) {
4115 * The flush queue implementation does not perform
4116 * page-selective invalidations that are required for efficient
4117 * TLB flushes in virtual environments. The benefit of batching
4118 * is likely to be much lower than the overhead of synchronizing
4119 * the virtual and physical IOMMU page-tables.
4121 if (cap_caching_mode(iommu->cap)) {
4122 pr_info_once("IOMMU batching disallowed due to virtualization\n");
4123 iommu_set_dma_strict();
4125 iommu_device_sysfs_add(&iommu->iommu, NULL,
4128 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4130 up_read(&dmar_global_lock);
4132 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4133 if (si_domain && !hw_pass_through)
4134 register_memory_notifier(&intel_iommu_memory_nb);
4136 down_read(&dmar_global_lock);
4137 if (probe_acpi_namespace_devices())
4138 pr_warn("ACPI name space devices didn't probe correctly\n");
4140 /* Finally, we enable the DMA remapping hardware. */
4141 for_each_iommu(iommu, drhd) {
4142 if (!drhd->ignored && !translation_pre_enabled(iommu))
4143 iommu_enable_translation(iommu);
4145 iommu_disable_protect_mem_regions(iommu);
4147 up_read(&dmar_global_lock);
4149 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4151 intel_iommu_enabled = 1;
4156 intel_iommu_free_dmars();
4157 up_write(&dmar_global_lock);
4161 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4163 struct device_domain_info *info = opaque;
4165 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4170 * NB - intel-iommu lacks any sort of reference counting for the users of
4171 * dependent devices. If multiple endpoints have intersecting dependent
4172 * devices, unbinding the driver from any one of them will possibly leave
4173 * the others unable to operate.
4175 static void domain_context_clear(struct device_domain_info *info)
4177 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4180 pci_for_each_dma_alias(to_pci_dev(info->dev),
4181 &domain_context_clear_one_cb, info);
4184 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4186 struct dmar_domain *domain;
4187 struct intel_iommu *iommu;
4188 unsigned long flags;
4190 assert_spin_locked(&device_domain_lock);
4195 iommu = info->iommu;
4196 domain = info->domain;
4198 if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4199 if (dev_is_pci(info->dev) && sm_supported(iommu))
4200 intel_pasid_tear_down_entry(iommu, info->dev,
4201 PASID_RID2PASID, false);
4203 iommu_disable_dev_iotlb(info);
4204 domain_context_clear(info);
4205 intel_pasid_free_table(info->dev);
4208 list_del(&info->link);
4210 spin_lock_irqsave(&iommu->lock, flags);
4211 domain_detach_iommu(domain, iommu);
4212 spin_unlock_irqrestore(&iommu->lock, flags);
4215 static void dmar_remove_one_dev_info(struct device *dev)
4217 struct device_domain_info *info;
4218 unsigned long flags;
4220 spin_lock_irqsave(&device_domain_lock, flags);
4221 info = dev_iommu_priv_get(dev);
4223 __dmar_remove_one_dev_info(info);
4224 spin_unlock_irqrestore(&device_domain_lock, flags);
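/*
 * Initialize a domain allocated through the IOMMU API: compute the
 * address widths and allocate the top-level page table. No IOMMU is
 * attached to the domain yet at this point.
 */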
4227 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4231 /* calculate AGAW */
4232 domain->gaw = guest_width;
4233 adjust_width = guestwidth_to_adjustwidth(guest_width);
4234 domain->agaw = width_to_agaw(adjust_width);
4236 domain->iommu_coherency = false;
4237 domain->iommu_superpage = 0;
4238 domain->max_addr = 0;
4240 /* always allocate the top pgd */
4241 domain->pgd = alloc_pgtable_page(domain->nid);
4244 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4248 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4250 struct dmar_domain *dmar_domain;
4251 struct iommu_domain *domain;
4254 case IOMMU_DOMAIN_DMA:
4255 case IOMMU_DOMAIN_DMA_FQ:
4256 case IOMMU_DOMAIN_UNMANAGED:
4257 dmar_domain = alloc_domain(type);
4259 pr_err("Can't allocate dmar_domain\n");
4262 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4263 pr_err("Domain initialization failed\n");
4264 domain_exit(dmar_domain);
4268 domain = &dmar_domain->domain;
4269 domain->geometry.aperture_start = 0;
4270 domain->geometry.aperture_end =
4271 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4272 domain->geometry.force_aperture = true;
4275 case IOMMU_DOMAIN_IDENTITY:
4276 return &si_domain->domain;
4284 static void intel_iommu_domain_free(struct iommu_domain *domain)
4286 if (domain != &si_domain->domain)
4287 domain_exit(to_dmar_domain(domain));
4290 static int prepare_domain_attach_device(struct iommu_domain *domain,
4293 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4294 struct intel_iommu *iommu;
4297 iommu = device_to_iommu(dev, NULL, NULL);
4301 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4304 /* check if this iommu agaw is sufficient for max mapped address */
4305 addr_width = agaw_to_width(iommu->agaw);
4306 if (addr_width > cap_mgaw(iommu->cap))
4307 addr_width = cap_mgaw(iommu->cap);
4309 if (dmar_domain->max_addr > (1LL << addr_width)) {
4310 dev_err(dev, "%s: iommu width (%d) is not "
4311 "sufficient for the mapped address (%llx)\n",
4312 __func__, addr_width, dmar_domain->max_addr);
4315 dmar_domain->gaw = addr_width;
4318 * Knock out extra levels of page tables if necessary
4320 while (iommu->agaw < dmar_domain->agaw) {
4321 struct dma_pte *pte;
4323 pte = dmar_domain->pgd;
4324 if (dma_pte_present(pte)) {
4325 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4326 free_pgtable_page(pte);
4328 dmar_domain->agaw--;
4334 static int intel_iommu_attach_device(struct iommu_domain *domain,
4339 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4340 device_is_rmrr_locked(dev)) {
4341 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4345 /* normally dev is not mapped */
4346 if (unlikely(domain_context_mapped(dev))) {
4347 struct device_domain_info *info = dev_iommu_priv_get(dev);
4350 dmar_remove_one_dev_info(dev);
4353 ret = prepare_domain_attach_device(domain, dev);
4357 return domain_add_dev_info(to_dmar_domain(domain), dev);
4360 static void intel_iommu_detach_device(struct iommu_domain *domain,
4363 dmar_remove_one_dev_info(dev);
4366 static int intel_iommu_map(struct iommu_domain *domain,
4367 unsigned long iova, phys_addr_t hpa,
4368 size_t size, int iommu_prot, gfp_t gfp)
4370 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4374 if (iommu_prot & IOMMU_READ)
4375 prot |= DMA_PTE_READ;
4376 if (iommu_prot & IOMMU_WRITE)
4377 prot |= DMA_PTE_WRITE;
4378 if (dmar_domain->set_pte_snp)
4379 prot |= DMA_PTE_SNP;
4381 max_addr = iova + size;
4382 if (dmar_domain->max_addr < max_addr) {
4385 /* check if minimum agaw is sufficient for mapped address */
4386 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4387 if (end < max_addr) {
4388 pr_err("%s: iommu width (%d) is not "
4389 "sufficient for the mapped address (%llx)\n",
4390 __func__, dmar_domain->gaw, max_addr);
4393 dmar_domain->max_addr = max_addr;
4395 /* Round up size to next multiple of PAGE_SIZE, if it and
4396 the low bits of hpa would take us onto the next page */
4397 size = aligned_nrpages(hpa, size);
4398 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4399 hpa >> VTD_PAGE_SHIFT, size, prot);
4402 static int intel_iommu_map_pages(struct iommu_domain *domain,
4403 unsigned long iova, phys_addr_t paddr,
4404 size_t pgsize, size_t pgcount,
4405 int prot, gfp_t gfp, size_t *mapped)
4407 unsigned long pgshift = __ffs(pgsize);
4408 size_t size = pgcount << pgshift;
4411 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4414 if (!IS_ALIGNED(iova | paddr, pgsize))
4417 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4424 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4425 unsigned long iova, size_t size,
4426 struct iommu_iotlb_gather *gather)
4428 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4429 unsigned long start_pfn, last_pfn;
4432 /* Cope with horrid API which requires us to unmap more than the
4433 size argument if it happens to be a large-page mapping. */
4434 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4436 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4437 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4439 start_pfn = iova >> VTD_PAGE_SHIFT;
4440 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4442 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4444 if (dmar_domain->max_addr == iova + size)
4445 dmar_domain->max_addr = iova;
4447 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4452 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4454 size_t pgsize, size_t pgcount,
4455 struct iommu_iotlb_gather *gather)
4457 unsigned long pgshift = __ffs(pgsize);
4458 size_t size = pgcount << pgshift;
4460 return intel_iommu_unmap(domain, iova, size, gather);
4463 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4464 struct iommu_iotlb_gather *gather)
4466 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4467 unsigned long iova_pfn = IOVA_PFN(gather->start);
4468 size_t size = gather->end - gather->start;
4469 unsigned long start_pfn;
4470 unsigned long nrpages;
4473 nrpages = aligned_nrpages(gather->start, size);
4474 start_pfn = mm_to_dma_pfn(iova_pfn);
4476 for_each_domain_iommu(iommu_id, dmar_domain)
4477 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4479 list_empty(&gather->freelist), 0);
4481 put_pages_list(&gather->freelist);
4484 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4487 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4488 struct dma_pte *pte;
4492 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4493 if (pte && dma_pte_present(pte))
4494 phys = dma_pte_addr(pte) +
4495 (iova & (BIT_MASK(level_to_offset_bits(level) +
4496 VTD_PAGE_SHIFT) - 1));
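/*
 * Force-snooping can only be enforced if every IOMMU that has a device
 * attached to this domain supports snoop control.
 */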
4501 static bool domain_support_force_snooping(struct dmar_domain *domain)
4503 struct device_domain_info *info;
4504 bool support = true;
4506 assert_spin_locked(&device_domain_lock);
4507 list_for_each_entry(info, &domain->devices, link) {
4508 if (!ecap_sc_support(info->iommu->ecap)) {
4517 static void domain_set_force_snooping(struct dmar_domain *domain)
4519 struct device_domain_info *info;
4521 assert_spin_locked(&device_domain_lock);
4524 * The second-level page table supports per-PTE snoop control. The
4525 * iommu_map() interface will handle this by setting the SNP bit.
4527 if (!domain_use_first_level(domain)) {
4528 domain->set_pte_snp = true;
4532 list_for_each_entry(info, &domain->devices, link)
4533 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4537 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4539 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4540 unsigned long flags;
4542 if (dmar_domain->force_snooping)
4545 spin_lock_irqsave(&device_domain_lock, flags);
4546 if (!domain_support_force_snooping(dmar_domain)) {
4547 spin_unlock_irqrestore(&device_domain_lock, flags);
4551 domain_set_force_snooping(dmar_domain);
4552 dmar_domain->force_snooping = true;
4553 spin_unlock_irqrestore(&device_domain_lock, flags);
4558 static bool intel_iommu_capable(enum iommu_cap cap)
4560 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4562 if (cap == IOMMU_CAP_INTR_REMAP)
4563 return irq_remapping_enabled == 1;
4564 if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4565 return dmar_platform_optin();
4570 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4572 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4573 struct device_domain_info *info;
4574 struct intel_iommu *iommu;
4575 unsigned long flags;
4578 iommu = device_to_iommu(dev, &bus, &devfn);
4580 return ERR_PTR(-ENODEV);
4582 info = kzalloc(sizeof(*info), GFP_KERNEL);
4584 return ERR_PTR(-ENOMEM);
4586 if (dev_is_real_dma_subdevice(dev)) {
4587 info->bus = pdev->bus->number;
4588 info->devfn = pdev->devfn;
4589 info->segment = pci_domain_nr(pdev->bus);
4592 info->devfn = devfn;
4593 info->segment = iommu->segment;
4597 info->iommu = iommu;
4598 if (dev_is_pci(dev)) {
4599 if (ecap_dev_iotlb_support(iommu->ecap) &&
4600 pci_ats_supported(pdev) &&
4601 dmar_ats_supported(pdev, iommu))
4602 info->ats_supported = 1;
4604 if (sm_supported(iommu)) {
4605 if (pasid_supported(iommu)) {
4606 int features = pci_pasid_features(pdev);
4609 info->pasid_supported = features | 1;
4612 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4613 pci_pri_supported(pdev))
4614 info->pri_supported = 1;
4618 spin_lock_irqsave(&device_domain_lock, flags);
4619 list_add(&info->global, &device_domain_list);
4620 dev_iommu_priv_set(dev, info);
4621 spin_unlock_irqrestore(&device_domain_lock, flags);
4623 return &iommu->iommu;
4626 static void intel_iommu_release_device(struct device *dev)
4628 struct device_domain_info *info = dev_iommu_priv_get(dev);
4629 unsigned long flags;
4631 dmar_remove_one_dev_info(dev);
4633 spin_lock_irqsave(&device_domain_lock, flags);
4634 dev_iommu_priv_set(dev, NULL);
4635 list_del(&info->global);
4636 spin_unlock_irqrestore(&device_domain_lock, flags);
4639 set_dma_ops(dev, NULL);
4642 static void intel_iommu_probe_finalize(struct device *dev)
4644 set_dma_ops(dev, NULL);
4645 iommu_setup_dma_ops(dev, 0, U64_MAX);
4648 static void intel_iommu_get_resv_regions(struct device *device,
4649 struct list_head *head)
4651 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4652 struct iommu_resv_region *reg;
4653 struct dmar_rmrr_unit *rmrr;
4654 struct device *i_dev;
4657 down_read(&dmar_global_lock);
4658 for_each_rmrr_units(rmrr) {
4659 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4661 struct iommu_resv_region *resv;
4662 enum iommu_resv_type type;
4665 if (i_dev != device &&
4666 !is_downstream_to_pci_bridge(device, i_dev))
4669 length = rmrr->end_address - rmrr->base_address + 1;
4671 type = device_rmrr_is_relaxable(device) ?
4672 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4674 resv = iommu_alloc_resv_region(rmrr->base_address,
4675 length, prot, type);
4679 list_add_tail(&resv->list, head);
4682 up_read(&dmar_global_lock);
4684 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4685 if (dev_is_pci(device)) {
4686 struct pci_dev *pdev = to_pci_dev(device);
4688 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4689 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4690 IOMMU_RESV_DIRECT_RELAXABLE);
4692 list_add_tail(&reg->list, head);
4695 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4697 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4698 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4702 list_add_tail(&reg->list, head);
4705 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4707 struct device_domain_info *info = dev_iommu_priv_get(dev);
4708 struct context_entry *context;
4709 struct dmar_domain *domain;
4710 unsigned long flags;
4714 domain = info->domain;
4718 spin_lock_irqsave(&device_domain_lock, flags);
4719 spin_lock(&iommu->lock);
4722 if (!info->pasid_supported)
4725 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4726 if (WARN_ON(!context))
4729 ctx_lo = context[0].lo;
4731 if (!(ctx_lo & CONTEXT_PASIDE)) {
4732 ctx_lo |= CONTEXT_PASIDE;
4733 context[0].lo = ctx_lo;
4735 iommu->flush.flush_context(iommu,
4736 domain->iommu_did[iommu->seq_id],
4737 PCI_DEVID(info->bus, info->devfn),
4738 DMA_CCMD_MASK_NOBIT,
4739 DMA_CCMD_DEVICE_INVL);
4742 /* Enable PASID support in the device, if it wasn't already */
4743 if (!info->pasid_enabled)
4744 iommu_enable_dev_iotlb(info);
4749 spin_unlock(&iommu->lock);
4750 spin_unlock_irqrestore(&device_domain_lock, flags);
4755 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4757 if (dev_is_pci(dev))
4758 return pci_device_group(dev);
4759 return generic_device_group(dev);
4762 static int intel_iommu_enable_sva(struct device *dev)
4764 struct device_domain_info *info = dev_iommu_priv_get(dev);
4765 struct intel_iommu *iommu;
4768 if (!info || dmar_disabled)
4771 iommu = info->iommu;
4775 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4778 if (intel_iommu_enable_pasid(iommu, dev))
4781 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4784 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4786 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4791 static int intel_iommu_disable_sva(struct device *dev)
4793 struct device_domain_info *info = dev_iommu_priv_get(dev);
4794 struct intel_iommu *iommu = info->iommu;
4797 ret = iommu_unregister_device_fault_handler(dev);
4799 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4804 static int intel_iommu_enable_iopf(struct device *dev)
4806 struct device_domain_info *info = dev_iommu_priv_get(dev);
4808 if (info && info->pri_supported)
4815 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4818 case IOMMU_DEV_FEAT_IOPF:
4819 return intel_iommu_enable_iopf(dev);
4821 case IOMMU_DEV_FEAT_SVA:
4822 return intel_iommu_enable_sva(dev);
4830 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4833 case IOMMU_DEV_FEAT_IOPF:
4836 case IOMMU_DEV_FEAT_SVA:
4837 return intel_iommu_disable_sva(dev);
4844 static bool intel_iommu_is_attach_deferred(struct device *dev)
4846 struct device_domain_info *info = dev_iommu_priv_get(dev);
4848 return translation_pre_enabled(info->iommu) && !info->domain;
4852 * Check that the device does not live on an external-facing PCI port that is
4853 * marked as untrusted. Such devices should not be able to apply quirks and
4854 * thus bypass the IOMMU restrictions.
4856 static bool risky_device(struct pci_dev *pdev)
4858 if (pdev->untrusted) {
4860 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4861 pdev->vendor, pdev->device);
4862 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
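
/* Tell every IOMMU that serves this domain about a newly mapped IOVA range. */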
static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
				       unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long pages = aligned_nrpages(iova, size);
	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
	struct intel_iommu *iommu;
	int iommu_id;

	for_each_domain_iommu(iommu_id, dmar_domain) {
		iommu = g_iommus[iommu_id];
		__mapping_notify_one(iommu, dmar_domain, pfn, pages);
	}
}
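
/* Callbacks the Intel VT-d driver registers with the generic IOMMU core. */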
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
	.page_response		= intel_svm_page_response,
#endif
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.detach_dev		= intel_iommu_detach_device,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all	= intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};
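
/* Quirk: keep the integrated graphics device out of DMA remapping entirely. */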
static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
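
/* Fields of the graphics control (GGC) register checked by the Calpella/Ironlake quirk below. */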
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		iommu_set_dma_strict();
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
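
/*
 * For the integrated graphics on these display generations, leave the
 * translation-enable bit of the graphics-dedicated DMAR unit alone when the
 * driver would otherwise disable it (see iommu_skip_te_disable).
 */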
static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	u32 ver;

	if (!IS_GFX_DEVICE(dev))
		return;

	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a && ver != 0xa7)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}